openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,748 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: openstat-cli
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Open-source statistical analysis tool — a free alternative to Stata, SPSS, and SAS
|
|
5
|
+
Project-URL: Homepage, https://github.com/openstat-project/openstat
|
|
6
|
+
Project-URL: Documentation, https://github.com/openstat-project/openstat#readme
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/openstat-project/openstat/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/openstat-project/openstat/blob/main/CHANGELOG.md
|
|
9
|
+
Author: baristiran
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: causal-inference,cli,data-analysis,econometrics,machine-learning,panel-data,regression,repl,spss,stata,statistics,survival-analysis,time-series
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Environment :: Console
|
|
15
|
+
Classifier: Intended Audience :: Education
|
|
16
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Mathematics
|
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
|
+
Requires-Python: >=3.10
|
|
29
|
+
Requires-Dist: matplotlib>=3.8
|
|
30
|
+
Requires-Dist: numpy>=1.24
|
|
31
|
+
Requires-Dist: polars>=1.0
|
|
32
|
+
Requires-Dist: prompt-toolkit>=3.0
|
|
33
|
+
Requires-Dist: rich>=13.0
|
|
34
|
+
Requires-Dist: scipy>=1.12
|
|
35
|
+
Requires-Dist: statsmodels>=0.14
|
|
36
|
+
Requires-Dist: typer>=0.12
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: arch>=6.0; extra == 'all'
|
|
39
|
+
Requires-Dist: connectorx>=0.3; extra == 'all'
|
|
40
|
+
Requires-Dist: duckdb>=0.10; extra == 'all'
|
|
41
|
+
Requires-Dist: fastapi>=0.100; extra == 'all'
|
|
42
|
+
Requires-Dist: ipython>=8.0; extra == 'all'
|
|
43
|
+
Requires-Dist: lifelines>=0.28; extra == 'all'
|
|
44
|
+
Requires-Dist: linearmodels>=6.0; extra == 'all'
|
|
45
|
+
Requires-Dist: nbformat>=5.0; extra == 'all'
|
|
46
|
+
Requires-Dist: networkx>=3.0; extra == 'all'
|
|
47
|
+
Requires-Dist: openpyxl>=3.1; extra == 'all'
|
|
48
|
+
Requires-Dist: pandas>=2.0; extra == 'all'
|
|
49
|
+
Requires-Dist: plotly>=5.0; extra == 'all'
|
|
50
|
+
Requires-Dist: pyreadstat>=1.0; extra == 'all'
|
|
51
|
+
Requires-Dist: python-docx>=1.1; extra == 'all'
|
|
52
|
+
Requires-Dist: python-multipart; extra == 'all'
|
|
53
|
+
Requires-Dist: python-pptx>=0.6; extra == 'all'
|
|
54
|
+
Requires-Dist: rapidfuzz>=3.0; extra == 'all'
|
|
55
|
+
Requires-Dist: reportlab>=4.0; extra == 'all'
|
|
56
|
+
Requires-Dist: scikit-learn>=1.4; extra == 'all'
|
|
57
|
+
Requires-Dist: semopy>=2.3; extra == 'all'
|
|
58
|
+
Requires-Dist: shap>=0.44; extra == 'all'
|
|
59
|
+
Requires-Dist: textual>=0.60; extra == 'all'
|
|
60
|
+
Requires-Dist: uvicorn>=0.30; extra == 'all'
|
|
61
|
+
Requires-Dist: websockets; extra == 'all'
|
|
62
|
+
Requires-Dist: xlsxwriter>=3.1; extra == 'all'
|
|
63
|
+
Provides-Extra: anthropic
|
|
64
|
+
Requires-Dist: anthropic>=0.20; extra == 'anthropic'
|
|
65
|
+
Provides-Extra: database
|
|
66
|
+
Requires-Dist: connectorx>=0.3; extra == 'database'
|
|
67
|
+
Provides-Extra: dev
|
|
68
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
69
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
70
|
+
Provides-Extra: duckdb
|
|
71
|
+
Requires-Dist: duckdb>=0.10; extra == 'duckdb'
|
|
72
|
+
Provides-Extra: excel
|
|
73
|
+
Requires-Dist: openpyxl>=3.1; extra == 'excel'
|
|
74
|
+
Requires-Dist: xlsxwriter>=3.1; extra == 'excel'
|
|
75
|
+
Provides-Extra: factor
|
|
76
|
+
Requires-Dist: scikit-learn>=1.4; extra == 'factor'
|
|
77
|
+
Provides-Extra: fuzzy
|
|
78
|
+
Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
|
|
79
|
+
Provides-Extra: garch
|
|
80
|
+
Requires-Dist: arch>=6.0; extra == 'garch'
|
|
81
|
+
Provides-Extra: interactive
|
|
82
|
+
Requires-Dist: plotly>=5.0; extra == 'interactive'
|
|
83
|
+
Provides-Extra: jupyter
|
|
84
|
+
Requires-Dist: ipython>=8.0; extra == 'jupyter'
|
|
85
|
+
Provides-Extra: ml
|
|
86
|
+
Requires-Dist: scikit-learn>=1.4; extra == 'ml'
|
|
87
|
+
Provides-Extra: network
|
|
88
|
+
Requires-Dist: networkx>=3.0; extra == 'network'
|
|
89
|
+
Provides-Extra: notebook
|
|
90
|
+
Requires-Dist: nbformat>=5.0; extra == 'notebook'
|
|
91
|
+
Provides-Extra: openai
|
|
92
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
93
|
+
Provides-Extra: panel
|
|
94
|
+
Requires-Dist: linearmodels>=6.0; extra == 'panel'
|
|
95
|
+
Provides-Extra: pdf
|
|
96
|
+
Requires-Dist: reportlab>=4.0; extra == 'pdf'
|
|
97
|
+
Provides-Extra: rbridge
|
|
98
|
+
Requires-Dist: rpy2>=3.5; extra == 'rbridge'
|
|
99
|
+
Provides-Extra: report
|
|
100
|
+
Requires-Dist: python-docx>=1.1; extra == 'report'
|
|
101
|
+
Requires-Dist: python-pptx>=0.6; extra == 'report'
|
|
102
|
+
Provides-Extra: sas
|
|
103
|
+
Requires-Dist: pandas>=2.0; extra == 'sas'
|
|
104
|
+
Requires-Dist: pyreadstat>=1.0; extra == 'sas'
|
|
105
|
+
Provides-Extra: sem
|
|
106
|
+
Requires-Dist: semopy>=2.3; extra == 'sem'
|
|
107
|
+
Provides-Extra: shap
|
|
108
|
+
Requires-Dist: shap>=0.44; extra == 'shap'
|
|
109
|
+
Provides-Extra: spss
|
|
110
|
+
Requires-Dist: pandas>=2.0; extra == 'spss'
|
|
111
|
+
Requires-Dist: pyreadstat>=1.0; extra == 'spss'
|
|
112
|
+
Provides-Extra: stata
|
|
113
|
+
Requires-Dist: pandas>=2.0; extra == 'stata'
|
|
114
|
+
Requires-Dist: pyreadstat>=1.0; extra == 'stata'
|
|
115
|
+
Provides-Extra: survival
|
|
116
|
+
Requires-Dist: lifelines>=0.28; extra == 'survival'
|
|
117
|
+
Provides-Extra: tui
|
|
118
|
+
Requires-Dist: textual>=0.60; extra == 'tui'
|
|
119
|
+
Provides-Extra: web
|
|
120
|
+
Requires-Dist: fastapi>=0.100; extra == 'web'
|
|
121
|
+
Requires-Dist: python-multipart; extra == 'web'
|
|
122
|
+
Requires-Dist: uvicorn>=0.30; extra == 'web'
|
|
123
|
+
Requires-Dist: websockets; extra == 'web'
|
|
124
|
+
Description-Content-Type: text/markdown
|
|
125
|
+
|
|
126
|
+
<p align="center">
|
|
127
|
+
<img src="https://img.shields.io/badge/version-1.0.0-blue?style=for-the-badge" alt="Version">
|
|
128
|
+
<img src="https://img.shields.io/badge/python-3.10%2B-brightgreen?style=for-the-badge&logo=python&logoColor=white" alt="Python">
|
|
129
|
+
<img src="https://img.shields.io/badge/license-MIT-orange?style=for-the-badge" alt="License">
|
|
130
|
+
<img src="https://img.shields.io/badge/tests-816%20passed-success?style=for-the-badge" alt="Tests">
|
|
131
|
+
<img src="https://img.shields.io/badge/commands-231-blueviolet?style=for-the-badge" alt="Commands">
|
|
132
|
+
<img src="https://img.shields.io/badge/powered%20by-Polars%20%7C%20statsmodels-purple?style=for-the-badge" alt="Stack">
|
|
133
|
+
</p>
|
|
134
|
+
|
|
135
|
+
<h1 align="center">OpenStat</h1>
|
|
136
|
+
|
|
137
|
+
<p align="center">
|
|
138
|
+
<strong>The open-source statistical analysis tool you've been waiting for.</strong><br>
|
|
139
|
+
Load data. Explore. Transform. Model. Plot. Report. All from your terminal.
|
|
140
|
+
</p>
|
|
141
|
+
|
|
142
|
+
<p align="center">
|
|
143
|
+
<a href="#-quick-start">Quick Start</a> •
|
|
144
|
+
<a href="#-why-openstat">Why OpenStat?</a> •
|
|
145
|
+
<a href="#-full-command-reference">Commands</a> •
|
|
146
|
+
<a href="#-statistical-models">Models</a> •
|
|
147
|
+
<a href="#-contributing">Contributing</a>
|
|
148
|
+
</p>
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
> **Note:** OpenStat is an independent, community-driven open-source project. It is not affiliated with, endorsed by, or connected to StataCorp LLC or any commercial statistical software vendor.
|
|
153
|
+
|
|
154
|
+
## Why OpenStat?
|
|
155
|
+
|
|
156
|
+
**Statistical analysis shouldn't require expensive licenses.** Every researcher, student, data scientist, and curious mind deserves access to professional-grade statistical tools — for free, forever.
|
|
157
|
+
|
|
158
|
+
OpenStat brings the familiar workflow of commercial statistical packages into your terminal with a clean, intuitive REPL. It's built on the incredible open-source Python ecosystem (Polars, statsmodels, scipy) and designed to be:
|
|
159
|
+
|
|
160
|
+
- **Accessible** — No licensing fees. No registration. Just `pip install` and go.
|
|
161
|
+
- **Familiar** — If you've used Stata, R, or SPSS, you'll feel right at home.
|
|
162
|
+
- **Fast** — Powered by [Polars](https://pola.rs/) (not pandas) for blazing-fast data operations.
|
|
163
|
+
- **Safe** — No `eval()` anywhere. All user expressions go through a secure whitelist parser.
|
|
164
|
+
- **Scriptable** — Write `.ost` scripts for reproducible analysis pipelines.
|
|
165
|
+
- **Extensible** — Adding a new command takes 10 lines of code. Seriously.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Quick Start
|
|
170
|
+
|
|
171
|
+
### Installation
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# Clone the repository
|
|
175
|
+
git clone https://github.com/baristiran/OpenStat.git
|
|
176
|
+
cd OpenStat
|
|
177
|
+
|
|
178
|
+
# Create a virtual environment (recommended)
|
|
179
|
+
python -m venv .venv
|
|
180
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
181
|
+
|
|
182
|
+
# Install OpenStat in editable mode with the dev/test extras (use ".[all]" to pull in every optional feature)
|
|
183
|
+
pip install -e ".[dev]"
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Launch the Interactive REPL
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
openstat repl
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
OpenStat v1.0.0 — Open-source statistical analysis tool
|
|
194
|
+
Type help for commands, quit to exit.
|
|
195
|
+
|
|
196
|
+
openstat> load examples/data.csv
|
|
197
|
+
Loaded 50 rows x 7 columns from examples/data.csv
|
|
198
|
+
|
|
199
|
+
openstat> summarize age income score
|
|
200
|
+
┌──────────┬────┬─────────┬─────────┬───────┬─────────┬─────────┬─────────┬─────────┐
|
|
201
|
+
│ Variable │ N │ Mean │ SD │ Min │ P25 │ P50 │ P75 │ Max │
|
|
202
|
+
├──────────┼────┼─────────┼─────────┼───────┼─────────┼─────────┼─────────┼─────────┤
|
|
203
|
+
│ age │ 50 │ 34.6600 │ 8.7634 │ 21.00 │ 27.2500 │ 34.0000 │ 42.5000 │ 53.0000 │
|
|
204
|
+
│ income │ 50 │ 49840.0 │ 17547.2 │ 26000 │ 34000.0 │ 47000.0 │ 66000.0 │ 88000.0 │
|
|
205
|
+
│ score │ 50 │ 7.4280 │ 1.2844 │ 4.90 │ 6.4750 │ 7.5000 │ 8.5500 │ 9.4000 │
|
|
206
|
+
└──────────┴────┴─────────┴─────────┴───────┴─────────┴─────────┴─────────┴─────────┘
|
|
207
|
+
|
|
208
|
+
openstat> ols score ~ age + income --robust
|
|
209
|
+
┌──────────┬────────┬─────────┬───────┬────────┬────────────┬─────────────┐
|
|
210
|
+
│ Variable │ Coef │ Std.Err │ t/z │ P>|t| │ [95% CI L] │ [95% CI H] │
|
|
211
|
+
├──────────┼────────┼─────────┼───────┼────────┼────────────┼─────────────┤
|
|
212
|
+
│ _cons │ 2.1435 │ 0.4521 │ 4.741 │ 0.0000 │ 1.2343 │ 3.0527 │
|
|
213
|
+
│ age │ 0.0312 │ 0.0187 │ 1.668 │ 0.1018 │ -0.0066 │ 0.0690 │
|
|
214
|
+
│ income │ 0.0001 │ 0.0000 │ 5.234 │ 0.0000 │ 0.0000 │ 0.0001 │
|
|
215
|
+
└──────────┴────────┴─────────┴───────┴────────┴────────────┴─────────────┘
|
|
216
|
+
N = 50 | R² = 0.5481 | Adj.R² = 0.5289 | F(2, 47) = 28.52 (p=0.0000)
|
|
217
|
+
|
|
218
|
+
openstat> predict yhat
|
|
219
|
+
Predictions added as 'yhat'. 50 rows x 8 columns.
|
|
220
|
+
|
|
221
|
+
openstat> quit
|
|
222
|
+
Bye!
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Run a Script
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
# Run an analysis script
|
|
229
|
+
openstat run examples/demo.ost
|
|
230
|
+
|
|
231
|
+
# Strict mode — stop on first error (great for CI/CD)
|
|
232
|
+
openstat run examples/demo.ost --strict
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## What's New in v0.2.0
|
|
238
|
+
|
|
239
|
+
Version 0.2.0 is a massive leap in statistical depth. Here's what's new:
|
|
240
|
+
|
|
241
|
+
| Feature | What it does | Example |
|
|
242
|
+
|---------|-------------|---------|
|
|
243
|
+
| **Interaction Terms** | Model interactions between variables | `ols y ~ x1*x2` or `ols y ~ x1:x2` |
|
|
244
|
+
| **Cluster-Robust SE** | Standard errors robust to within-group correlation | `ols y ~ x1 + x2 --cluster=region` |
|
|
245
|
+
| **Poisson Regression** | Count data modeling with optional exposure offset | `poisson visits ~ age + income --exposure=time` |
|
|
246
|
+
| **Negative Binomial** | Overdispersed count data (reports dispersion alpha) | `negbin claims ~ age + gender` |
|
|
247
|
+
| **Quantile Regression** | Model any quantile, not just the mean | `quantreg y ~ x1 + x2 tau=0.75` |
|
|
248
|
+
| **Marginal Effects** | Average or at-means marginal effects for logit/probit | `margins --at=average` |
|
|
249
|
+
| **Bootstrap CI** | Non-parametric confidence intervals via resampling | `bootstrap n=1000 ci=95` |
|
|
250
|
+
| **Post-Estimation Diagnostics** | Breusch-Pagan, Ramsey RESET, link test, IC | `estat all` |
|
|
251
|
+
| **Model Comparison** | Side-by-side model comparison table | `estimates table` |
|
|
252
|
+
| **Multi-Way Interactions** | Three-way and beyond: `x1*x2*x3` auto-expands | Full factorial expansion |
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Full Command Reference
|
|
257
|
+
|
|
258
|
+
### Data Management
|
|
259
|
+
|
|
260
|
+
| Command | Description | Example |
|
|
261
|
+
|---------|-------------|---------|
|
|
262
|
+
| `load <path>` | Load CSV, Parquet, Stata (.dta), Excel (.xlsx) | `load survey.csv` |
|
|
263
|
+
| `save <path>` | Save data to any supported format | `save results.parquet` |
|
|
264
|
+
| `describe` | Show dataset structure (types, nulls) | `describe` |
|
|
265
|
+
| `head [N]` | Show first N rows (default: 10) | `head 20` |
|
|
266
|
+
| `tail [N]` | Show last N rows | `tail 5` |
|
|
267
|
+
| `count` | Row and column count | `count` |
|
|
268
|
+
| `merge <path> on <key> [how=...]` | Join with another file | `merge scores.csv on id how=left` |
|
|
269
|
+
| `undo` | Undo last data change (multi-level) | `undo` |
|
|
270
|
+
|
|
271
|
+
### Data Transformation
|
|
272
|
+
|
|
273
|
+
| Command | Description | Example |
|
|
274
|
+
|---------|-------------|---------|
|
|
275
|
+
| `filter <expr>` | Filter rows with expressions | `filter age > 30 and income < 50000` |
|
|
276
|
+
| `select <cols>` | Keep specific columns | `select age income score` |
|
|
277
|
+
| `derive <col> = <expr>` | Create new variables | `derive bmi = weight / (height ** 2)` |
|
|
278
|
+
| `dropna [cols]` | Drop missing values | `dropna age income` |
|
|
279
|
+
| `fillna <col> <strategy>` | Fill missing values | `fillna income median` |
|
|
280
|
+
| `sort <col> [--desc]` | Sort dataset | `sort income --desc` |
|
|
281
|
+
| `rename <old> <new>` | Rename a column | `rename income salary` |
|
|
282
|
+
| `cast <col> <type>` | Cast column type | `cast age float` |
|
|
283
|
+
| `encode <col> [as <new>]` | Label-encode strings | `encode region as region_code` |
|
|
284
|
+
| `recode <col> old=new ...` | Recode values | `recode region North=N South=S` |
|
|
285
|
+
| `replace <col> <old> <new>` | Replace values | `replace region North Norte` |
|
|
286
|
+
| `sample <N\|N%>` | Random sample | `sample 100` or `sample 10%` |
|
|
287
|
+
| `duplicates [drop] [cols]` | Find or drop duplicates | `duplicates drop` |
|
|
288
|
+
| `unique <col>` | List unique values | `unique region` |
|
|
289
|
+
| `lag <col> [N]` | Lag variable (shift down) | `lag price 2` |
|
|
290
|
+
| `lead <col> [N]` | Lead variable (shift up) | `lead price` |
|
|
291
|
+
| `pivot <val> by <col> [over <id>]` | Reshape to wide format | `pivot score by subject over name` |
|
|
292
|
+
| `melt <ids>, <vals>` | Reshape to long format | `melt name, math eng` |
|
|
293
|
+
|
|
294
|
+
### Descriptive Statistics
|
|
295
|
+
|
|
296
|
+
| Command | Description | Example |
|
|
297
|
+
|---------|-------------|---------|
|
|
298
|
+
| `summarize [cols]` | Summary statistics (N, Mean, SD, quartiles) | `summarize age income` |
|
|
299
|
+
| `tabulate <col>` | Frequency table (top 50 values) | `tabulate education` |
|
|
300
|
+
| `crosstab <row> <col>` | Two-way contingency table with row percentages | `crosstab gender status` |
|
|
301
|
+
| `corr [cols]` | Pearson correlation matrix | `corr age income score` |
|
|
302
|
+
| `groupby <cols> summarize <aggs>` | Group-by with aggregations | `groupby region summarize mean(income) count()` |
|
|
303
|
+
|
|
304
|
+
### Statistical Models
|
|
305
|
+
|
|
306
|
+
| Command | Description | Example |
|
|
307
|
+
|---------|-------------|---------|
|
|
308
|
+
| `ols y ~ x1 + x2` | OLS linear regression | `ols score ~ age + income --robust` |
|
|
309
|
+
| `logit y ~ x1 + x2` | Logistic regression (binary) | `logit employed ~ age + income` |
|
|
310
|
+
| `probit y ~ x1 + x2` | Probit regression (binary) | `probit employed ~ age + income` |
|
|
311
|
+
| `poisson y ~ x1 + x2` | Poisson regression (counts) | `poisson visits ~ age --exposure=time` |
|
|
312
|
+
| `negbin y ~ x1 + x2` | Negative Binomial (overdispersed) | `negbin claims ~ age + gender` |
|
|
313
|
+
| `quantreg y ~ x1 + x2` | Quantile regression | `quantreg wage ~ edu + exp tau=0.9` |
|
|
314
|
+
|
|
315
|
+
**All models support:** `--robust` (heteroscedasticity-robust SE), `--cluster=col` (cluster-robust SE)
|
|
316
|
+
|
|
317
|
+
**Formula syntax:**
|
|
318
|
+
- `y ~ x1 + x2` — standard predictors
|
|
319
|
+
- `y ~ x1*x2` — full factorial (expands to `x1 + x2 + x1:x2`)
|
|
320
|
+
- `y ~ x1:x2` — interaction term only
|
|
321
|
+
- `y ~ x1*x2*x3` — three-way interaction (all combinations)
|
|
322
|
+
|
|
323
|
+
### Post-Estimation
|
|
324
|
+
|
|
325
|
+
| Command | Description | Example |
|
|
326
|
+
|---------|-------------|---------|
|
|
327
|
+
| `predict [name]` | Predicted values from last model | `predict yhat` |
|
|
328
|
+
| `residuals [name]` | Residuals + diagnostic plots | `residuals resid` |
|
|
329
|
+
| `vif` | Variance Inflation Factor | `vif` |
|
|
330
|
+
| `margins [--at=means\|average]` | Marginal effects (logit/probit) | `margins --at=average` |
|
|
331
|
+
| `bootstrap [n=N] [ci=N]` | Bootstrap confidence intervals | `bootstrap n=1000 ci=95` |
|
|
332
|
+
| `estat <sub>` | Post-estimation diagnostics | `estat all` |
|
|
333
|
+
| `estimates table` | Side-by-side model comparison | `estimates table` |
|
|
334
|
+
| `stepwise y ~ x1 + ...` | Stepwise variable selection | `stepwise y ~ x1 + x2 + x3 --backward` |
|
|
335
|
+
| `latex [path.tex]` | Export model as LaTeX table | `latex results.tex` |
|
|
336
|
+
|
|
337
|
+
**`estat` subcommands:**
|
|
338
|
+
- `estat hettest` — Breusch-Pagan heteroscedasticity test
|
|
339
|
+
- `estat ovtest` — Ramsey RESET specification test
|
|
340
|
+
- `estat linktest` — Link test for model specification
|
|
341
|
+
- `estat ic` — Information criteria (AIC, BIC, Log-Likelihood)
|
|
342
|
+
- `estat all` — Run all diagnostics at once
|
|
343
|
+
|
|
344
|
+
### Hypothesis Tests
|
|
345
|
+
|
|
346
|
+
| Command | Description | Example |
|
|
347
|
+
|---------|-------------|---------|
|
|
348
|
+
| `ttest <col> [mu=N]` | One-sample t-test (H0: mean=mu, default 0) | `ttest score mu=7` |
|
|
349
|
+
| `ttest <col> by <group>` | Two-sample Welch t-test | `ttest income by employed` |
|
|
350
|
+
| `ttest <col> paired <col2>` | Paired t-test | `ttest before paired after` |
|
|
351
|
+
| `chi2 <col1> <col2>` | Chi-square independence test | `chi2 region employed` |
|
|
352
|
+
| `anova <col> by <group>` | One-way ANOVA (F-test) | `anova score by region` |
|
|
353
|
+
|
|
354
|
+
### Visualization
|
|
355
|
+
|
|
356
|
+
| Command | Description | Example |
|
|
357
|
+
|---------|-------------|---------|
|
|
358
|
+
| `plot hist <col>` | Histogram | `plot hist age` |
|
|
359
|
+
| `plot scatter <y> <x>` | Scatter plot | `plot scatter score income` |
|
|
360
|
+
| `plot line <y> <x>` | Line plot | `plot line score age` |
|
|
361
|
+
| `plot box <col> [by <g>]` | Box plot (optionally grouped) | `plot box income by region` |
|
|
362
|
+
| `plot bar <col> [by <g>]` | Bar chart | `plot bar income by region` |
|
|
363
|
+
| `plot heatmap [cols]` | Correlation heatmap | `plot heatmap age income score` |
|
|
364
|
+
| `plot diagnostics` | Residual diagnostic plots | `plot diagnostics` |
|
|
365
|
+
|
|
366
|
+
### Other
|
|
367
|
+
|
|
368
|
+
| Command | Description | Example |
|
|
369
|
+
|---------|-------------|---------|
|
|
370
|
+
| `report <path>` | Generate Markdown report | `report analysis.md` |
|
|
371
|
+
| `help [cmd]` | Show help (all or specific command) | `help ols` |
|
|
372
|
+
| `quit` / `exit` / `q` | Exit REPL | `quit` |
|
|
373
|
+
|
|
374
|
+
---
|
|
375
|
+
|
|
376
|
+
## Expression Language
|
|
377
|
+
|
|
378
|
+
The expression language used by `filter` and `derive` is a **safe, recursive-descent parser** — no Python `eval()` is ever used.
|
|
379
|
+
|
|
380
|
+
```bash
|
|
381
|
+
# Arithmetic
|
|
382
|
+
openstat> derive income_k = income / 1000
|
|
383
|
+
openstat> derive bmi = weight / (height ** 2)
|
|
384
|
+
|
|
385
|
+
# Comparisons and boolean logic
|
|
386
|
+
openstat> filter age > 30 and income < 50000
|
|
387
|
+
openstat> filter not is_null(score) and region == "North"
|
|
388
|
+
|
|
389
|
+
# Functions
|
|
390
|
+
openstat> derive log_income = log(income)
|
|
391
|
+
openstat> derive name_upper = upper(name)
|
|
392
|
+
openstat> derive score_clean = fill_null(score, 0)
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
**Available functions:**
|
|
396
|
+
|
|
397
|
+
| Category | Functions |
|
|
398
|
+
|----------|----------|
|
|
399
|
+
| Math | `log(x)`, `log10(x)`, `sqrt(x)`, `abs(x)`, `exp(x)`, `round(x, n)` |
|
|
400
|
+
| String | `upper(x)`, `lower(x)`, `len_chars(x)`, `strip(x)`, `contains(x, "pattern")` |
|
|
401
|
+
| Null | `is_null(x)`, `is_not_null(x)`, `fill_null(x, value)` |
|
|
402
|
+
| Type | `cast_float(x)`, `cast_int(x)`, `cast_str(x)` |
|
|
403
|
+
|
|
404
|
+
---
|
|
405
|
+
|
|
406
|
+
## Statistical Models — In Depth
|
|
407
|
+
|
|
408
|
+
### Automatic Diagnostics
|
|
409
|
+
|
|
410
|
+
Every model automatically checks for common problems and warns you:
|
|
411
|
+
|
|
412
|
+
- **Multicollinearity** — Condition number > 30 triggers a warning
|
|
413
|
+
- **Heteroscedasticity** — Breusch-Pagan test; suggests `--robust` if p < 0.05
|
|
414
|
+
- **Autocorrelation** — Durbin-Watson statistic far from 2.0
|
|
415
|
+
- **Convergence** — Warns if logit/probit MLE did not converge
|
|
416
|
+
- **Missing values** — Reports how many observations were dropped
|
|
417
|
+
- **Low sample size** — Warns when observation-to-predictor ratio is low
|
|
418
|
+
|
|
419
|
+
### Interaction Terms
|
|
420
|
+
|
|
421
|
+
```bash
|
|
422
|
+
# Full factorial: automatically expands to x1 + x2 + x1:x2
|
|
423
|
+
openstat> ols y ~ x1*x2
|
|
424
|
+
|
|
425
|
+
# Interaction only
|
|
426
|
+
openstat> ols y ~ x1 + x2 + x1:x2
|
|
427
|
+
|
|
428
|
+
# Three-way interaction (7 terms total)
|
|
429
|
+
openstat> ols y ~ x1*x2*x3
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
### Cluster-Robust Standard Errors
|
|
433
|
+
|
|
434
|
+
```bash
|
|
435
|
+
# Clustered at the region level
|
|
436
|
+
openstat> ols wage ~ education + experience --cluster=region
|
|
437
|
+
|
|
438
|
+
# Works with all model types
|
|
439
|
+
openstat> logit promoted ~ age + performance --cluster=department
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
### Marginal Effects
|
|
443
|
+
|
|
444
|
+
After fitting a logit or probit model, compute marginal effects to understand the practical impact:
|
|
445
|
+
|
|
446
|
+
```bash
|
|
447
|
+
openstat> logit employed ~ age + education + income
|
|
448
|
+
openstat> margins # Average marginal effects (default)
|
|
449
|
+
openstat> margins --at=means # Marginal effects at means
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
### Bootstrap Confidence Intervals
|
|
453
|
+
|
|
454
|
+
Non-parametric bootstrap for any model — no distributional assumptions needed:
|
|
455
|
+
|
|
456
|
+
```bash
|
|
457
|
+
openstat> ols y ~ x1 + x2
|
|
458
|
+
openstat> bootstrap n=1000 ci=95 # 1000 replications, 95% CI
|
|
459
|
+
openstat> bootstrap n=5000 ci=99 # More replications, 99% CI
|
|
460
|
+
```
|
|
461
|
+
|
|
462
|
+
Bootstrap uses thread-pool parallelism for speed when n > 100.
|
|
463
|
+
|
|
464
|
+
### Model Comparison
|
|
465
|
+
|
|
466
|
+
Run multiple models and compare them side-by-side:
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
openstat> ols y ~ x1
|
|
470
|
+
openstat> ols y ~ x1 + x2
|
|
471
|
+
openstat> ols y ~ x1 + x2 + x1:x2
|
|
472
|
+
openstat> estimates table
|
|
473
|
+
```
|
|
474
|
+
|
|
475
|
+
This produces a publication-ready comparison table with coefficients, standard errors, R², AIC, BIC, and more.
|
|
476
|
+
|
|
477
|
+
---
|
|
478
|
+
|
|
479
|
+
## File Formats
|
|
480
|
+
|
|
481
|
+
| Format | Import | Export | Dependency |
|
|
482
|
+
|--------|:------:|:------:|------------|
|
|
483
|
+
| CSV | Yes | Yes | Built-in |
|
|
484
|
+
| Parquet | Yes | Yes | Built-in |
|
|
485
|
+
| Stata (.dta) | Yes | Yes | `pip install "openstat-cli[stata]"` |
|
|
486
|
+
| Excel (.xlsx) | Yes | Yes | `pip install "openstat-cli[excel]"` |
|
|
487
|
+
|
|
488
|
+
---
|
|
489
|
+
|
|
490
|
+
## Configuration
|
|
491
|
+
|
|
492
|
+
Customize OpenStat by creating `~/.openstat/config.toml`:
|
|
493
|
+
|
|
494
|
+
```toml
|
|
495
|
+
[data]
|
|
496
|
+
output_dir = "outputs"
|
|
497
|
+
csv_separator = ","
|
|
498
|
+
|
|
499
|
+
[display]
|
|
500
|
+
tabulate_limit = 50
|
|
501
|
+
head_default = 10
|
|
502
|
+
|
|
503
|
+
[undo]
|
|
504
|
+
max_undo_stack = 20
|
|
505
|
+
max_undo_memory_mb = 500
|
|
506
|
+
|
|
507
|
+
[plotting]
|
|
508
|
+
plot_dpi = 150
|
|
509
|
+
plot_figsize_w = 8.0
|
|
510
|
+
plot_figsize_h = 5.0
|
|
511
|
+
|
|
512
|
+
[model]
|
|
513
|
+
condition_threshold = 30
|
|
514
|
+
min_obs_per_predictor = 5
|
|
515
|
+
bootstrap_iterations = 1000
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
---
|
|
519
|
+
|
|
520
|
+
## CLI Options
|
|
521
|
+
|
|
522
|
+
```bash
|
|
523
|
+
openstat repl # Interactive REPL
|
|
524
|
+
openstat run script.ost # Run a script
|
|
525
|
+
openstat run script.ost --strict # Stop on first error (exit code 1)
|
|
526
|
+
openstat --verbose repl # Verbose logging (INFO)
|
|
527
|
+
openstat --debug repl # Debug logging (DEBUG)
|
|
528
|
+
openstat --version # Show version
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
Logs are saved to `~/.openstat/logs/openstat.log`.
|
|
532
|
+
|
|
533
|
+
---
|
|
534
|
+
|
|
535
|
+
## Aggregation Functions
|
|
536
|
+
|
|
537
|
+
For use with `groupby ... summarize`:
|
|
538
|
+
|
|
539
|
+
| Function | Description |
|
|
540
|
+
|----------|-------------|
|
|
541
|
+
| `mean(col)` | Arithmetic mean |
|
|
542
|
+
| `sd(col)` | Standard deviation (sample) |
|
|
543
|
+
| `sum(col)` | Sum |
|
|
544
|
+
| `min(col)` | Minimum |
|
|
545
|
+
| `max(col)` | Maximum |
|
|
546
|
+
| `median(col)` | Median |
|
|
547
|
+
| `count()` | Row count per group |
|
|
548
|
+
|
|
549
|
+
---
|
|
550
|
+
|
|
551
|
+
## Technology Stack
|
|
552
|
+
|
|
553
|
+
OpenStat is built on best-in-class open-source libraries:
|
|
554
|
+
|
|
555
|
+
| Component | Library | Why |
|
|
556
|
+
|-----------|---------|-----|
|
|
557
|
+
| Data Engine | [Polars](https://pola.rs/) | 10-100x faster than pandas, zero-copy, Rust-powered |
|
|
558
|
+
| Statistics | [statsmodels](https://www.statsmodels.org/) | Industry-standard OLS, GLM, quantile regression |
|
|
559
|
+
| Scientific | [SciPy](https://scipy.org/) | Hypothesis tests, distributions |
|
|
560
|
+
| Plotting | [matplotlib](https://matplotlib.org/) | Publication-quality figures |
|
|
561
|
+
| CLI Framework | [Typer](https://typer.tiangolo.com/) | Beautiful CLI with zero boilerplate |
|
|
562
|
+
| Terminal UI | [Rich](https://github.com/Textualize/rich) | Gorgeous tables and formatting |
|
|
563
|
+
| REPL | [prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/) | Tab completion, history, syntax highlighting |
|
|
564
|
+
|
|
565
|
+
---
|
|
566
|
+
|
|
567
|
+
## Project Structure
|
|
568
|
+
|
|
569
|
+
```
|
|
570
|
+
OpenStat/
|
|
571
|
+
├── src/openstat/
|
|
572
|
+
│ ├── cli.py # Typer CLI entry point
|
|
573
|
+
│ ├── repl.py # Interactive REPL with tab completion
|
|
574
|
+
│ ├── session.py # Session state, undo system
|
|
575
|
+
│ ├── config.py # Configuration management (~/.openstat/config.toml)
|
|
576
|
+
│ ├── commands/
|
|
577
|
+
│ │ ├── base.py # @command decorator, registry, CommandArgs
|
|
578
|
+
│ │ ├── data_cmds.py # load, filter, select, derive, sort, merge, ...
|
|
579
|
+
│ │ ├── stat_cmds.py # summarize, ols, logit, poisson, margins, ...
|
|
580
|
+
│ │ ├── plot_cmds.py # plot hist/scatter/line/box/bar/heatmap
|
|
581
|
+
│ │ └── report_cmds.py # report, help
|
|
582
|
+
│ ├── dsl/
|
|
583
|
+
│ │ ├── tokenizer.py # Safe expression tokenizer
|
|
584
|
+
│ │ └── parser.py # Recursive descent parser (no eval!)
|
|
585
|
+
│ ├── stats/
|
|
586
|
+
│ │ └── models.py # OLS, Logit, Probit, Poisson, NegBin, QuantReg, ...
|
|
587
|
+
│ ├── plots/
|
|
588
|
+
│ │ └── plotter.py # matplotlib chart generation
|
|
589
|
+
│ ├── io/
|
|
590
|
+
│ │ └── loader.py # CSV, Parquet, DTA, Excel loaders
|
|
591
|
+
│ └── reporting/
|
|
592
|
+
│ └── report.py # Markdown report generator
|
|
593
|
+
├── tests/ # 343 tests (and growing!)
|
|
594
|
+
├── examples/
|
|
595
|
+
│ ├── data.csv # Sample dataset
|
|
596
|
+
│ └── demo.ost # Demo script showcasing all features
|
|
597
|
+
├── .github/workflows/
|
|
598
|
+
│ └── ci.yml # GitHub Actions: test on 4 Python versions x 2 OS
|
|
599
|
+
├── pyproject.toml
|
|
600
|
+
├── LICENSE # MIT
|
|
601
|
+
├── CONTRIBUTING.md
|
|
602
|
+
└── README.md
|
|
603
|
+
```
|
|
604
|
+
|
|
605
|
+
---
|
|
606
|
+
|
|
607
|
+
## Development
|
|
608
|
+
|
|
609
|
+
```bash
|
|
610
|
+
# Install with dev dependencies
|
|
611
|
+
pip install -e ".[dev]"
|
|
612
|
+
|
|
613
|
+
# Run the full test suite
|
|
614
|
+
pytest
|
|
615
|
+
|
|
616
|
+
# Run with coverage
|
|
617
|
+
pytest --cov=openstat --cov-report=term-missing
|
|
618
|
+
|
|
619
|
+
# Run a specific test file
|
|
620
|
+
pytest tests/test_v020.py -v
|
|
621
|
+
|
|
622
|
+
# Lint
|
|
623
|
+
pip install ruff
|
|
624
|
+
ruff check src/ tests/
|
|
625
|
+
```
|
|
626
|
+
|
|
627
|
+
**Current test status:** 343 tests passed, 0 failures across 11 test files.
|
|
628
|
+
|
|
629
|
+
---
|
|
630
|
+
|
|
631
|
+
## Contributing
|
|
632
|
+
|
|
633
|
+
**We love contributions!** Whether you're fixing a typo, adding a new command, improving documentation, or building an entire new feature — your contribution matters and is deeply appreciated.
|
|
634
|
+
|
|
635
|
+
OpenStat is built by the community, for the community. Every contribution makes statistical analysis more accessible to researchers, students, and data scientists around the world.
|
|
636
|
+
|
|
637
|
+
### How to Contribute
|
|
638
|
+
|
|
639
|
+
1. **Fork** the repository
|
|
640
|
+
2. **Create** a feature branch (`git checkout -b feature/amazing-feature`)
|
|
641
|
+
3. **Write** your code and tests
|
|
642
|
+
4. **Ensure** all tests pass (`pytest`) and lint is clean (`ruff check src/ tests/`)
|
|
643
|
+
5. **Submit** a Pull Request with a clear description
|
|
644
|
+
|
|
645
|
+
### What Can You Contribute?
|
|
646
|
+
|
|
647
|
+
- **New statistical methods** — Panel data, time series, survival analysis, mixed models
|
|
648
|
+
- **New commands** — Any data manipulation or analysis command you find useful
|
|
649
|
+
- **DSL functions** — Add functions to the expression language
|
|
650
|
+
- **Plot types** — New visualization types
|
|
651
|
+
- **Documentation** — Tutorials, examples, translations
|
|
652
|
+
- **Bug reports** — Found something that doesn't work? Open an issue!
|
|
653
|
+
- **Performance** — Found a bottleneck? We'd love a PR!
|
|
654
|
+
- **File formats** — Support for more data formats (SAS, SPSS, etc.)
|
|
655
|
+
|
|
656
|
+
### First-Time Contributors Welcome!
|
|
657
|
+
|
|
658
|
+
Never contributed to open source before? No problem! Look for issues labeled `good first issue`. We're happy to mentor and guide you through the process. Every expert was once a beginner.
|
|
659
|
+
|
|
660
|
+
Check out [CONTRIBUTING.md](CONTRIBUTING.md) for detailed setup instructions and coding guidelines.
|
|
661
|
+
|
|
662
|
+
---
|
|
663
|
+
|
|
664
|
+
## Roadmap
|
|
665
|
+
|
|
666
|
+
We have big plans for OpenStat. Here's what's coming:
|
|
667
|
+
|
|
668
|
+
### Completed
|
|
669
|
+
|
|
670
|
+
- [x] OLS, Logit, Probit regression
|
|
671
|
+
- [x] Interaction terms (`x1*x2`, `x1:x2`, multi-way)
|
|
672
|
+
- [x] Cluster-robust standard errors
|
|
673
|
+
- [x] Poisson & Negative Binomial regression
|
|
674
|
+
- [x] Quantile regression
|
|
675
|
+
- [x] Marginal effects (average, at-means)
|
|
676
|
+
- [x] Bootstrap confidence intervals (parallelized)
|
|
677
|
+
- [x] Post-estimation diagnostics (`estat`)
|
|
678
|
+
- [x] Model comparison tables (`estimates table`)
|
|
679
|
+
- [x] Stepwise variable selection (forward/backward)
|
|
680
|
+
- [x] Robust standard errors (HC1)
|
|
681
|
+
- [x] Residual diagnostics with plots
|
|
682
|
+
- [x] VIF multicollinearity check
|
|
683
|
+
- [x] LaTeX table export
|
|
684
|
+
- [x] Data joining/merging
|
|
685
|
+
- [x] Pivot/melt reshaping
|
|
686
|
+
- [x] Safe expression language (no eval)
|
|
687
|
+
- [x] Tab completion in REPL
|
|
688
|
+
- [x] Configuration file support
|
|
689
|
+
- [x] Multi-level undo with memory management
|
|
690
|
+
- [x] CI/CD with GitHub Actions
|
|
691
|
+
|
|
692
|
+
### Planned
|
|
693
|
+
|
|
694
|
+
- [ ] Panel data / fixed effects / random effects
|
|
695
|
+
- [ ] Time series analysis (ARIMA, VAR)
|
|
696
|
+
- [ ] Survival analysis (Cox PH, Kaplan-Meier)
|
|
697
|
+
- [ ] Mixed / hierarchical linear models
|
|
698
|
+
- [ ] Instrumental variables (2SLS, IV)
|
|
699
|
+
- [ ] DuckDB / LazyFrame backend for large datasets
|
|
700
|
+
- [ ] Plugin / extension system
|
|
701
|
+
- [ ] Web-based GUI
|
|
702
|
+
- [ ] Jupyter notebook integration
|
|
703
|
+
- [ ] SAS (.sas7bdat) and SPSS (.sav) file support
|
|
704
|
+
- [ ] Multiple imputation for missing data
|
|
705
|
+
- [ ] Survey weighting support
|
|
706
|
+
|
|
707
|
+
---
|
|
708
|
+
|
|
709
|
+
## Community
|
|
710
|
+
|
|
711
|
+
OpenStat is more than code — it's a community of people who believe that statistical tools should be free and open. If you use OpenStat in your research, teaching, or work, we'd love to hear about it!
|
|
712
|
+
|
|
713
|
+
- **Star this repo** if you find it useful — it helps others discover the project
|
|
714
|
+
- **Share** with colleagues, students, and fellow researchers
|
|
715
|
+
- **Open issues** for bugs, feature requests, or questions
|
|
716
|
+
- **Join the conversation** in GitHub Discussions
|
|
717
|
+
|
|
718
|
+
---
|
|
719
|
+
|
|
720
|
+
## Acknowledgements
|
|
721
|
+
|
|
722
|
+
OpenStat stands on the shoulders of giants. Huge thanks to the maintainers and contributors of:
|
|
723
|
+
|
|
724
|
+
- [Polars](https://pola.rs/) — for reimagining what a DataFrame library can be
|
|
725
|
+
- [statsmodels](https://www.statsmodels.org/) — for bringing professional statistics to Python
|
|
726
|
+
- [SciPy](https://scipy.org/) — for decades of scientific computing excellence
|
|
727
|
+
- [Rich](https://github.com/Textualize/rich) — for making terminal output beautiful
|
|
728
|
+
- [prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/) — for the interactive REPL foundation
|
|
729
|
+
|
|
730
|
+
And to every researcher, student, and data scientist who believes in open science. This project is for you.
|
|
731
|
+
|
|
732
|
+
---
|
|
733
|
+
|
|
734
|
+
## License
|
|
735
|
+
|
|
736
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
737
|
+
|
|
738
|
+
Free as in freedom. Free as in beer. Use it, modify it, share it, sell it — just keep the copyright and license notice, as the MIT License requires.
|
|
739
|
+
|
|
740
|
+
---
|
|
741
|
+
|
|
742
|
+
<p align="center">
|
|
743
|
+
<strong>If OpenStat helps your work, give it a star! Every star helps more people discover free statistical tools.</strong>
|
|
744
|
+
</p>
|
|
745
|
+
|
|
746
|
+
<p align="center">
|
|
747
|
+
Made with care for the open-source community.
|
|
748
|
+
</p>
|