openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,748 @@
1
+ Metadata-Version: 2.4
2
+ Name: openstat-cli
3
+ Version: 1.0.0
4
+ Summary: Open-source statistical analysis tool — a free alternative to Stata, SPSS, and SAS
5
+ Project-URL: Homepage, https://github.com/openstat-project/openstat
6
+ Project-URL: Documentation, https://github.com/openstat-project/openstat#readme
7
+ Project-URL: Bug Tracker, https://github.com/openstat-project/openstat/issues
8
+ Project-URL: Changelog, https://github.com/openstat-project/openstat/blob/main/CHANGELOG.md
9
+ Author: baristiran
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: causal-inference,cli,data-analysis,econometrics,machine-learning,panel-data,regression,repl,spss,stata,statistics,survival-analysis,time-series
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: Financial and Insurance Industry
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Programming Language :: Python :: 3.12
24
+ Classifier: Programming Language :: Python :: 3.13
25
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
+ Classifier: Topic :: Scientific/Engineering :: Mathematics
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: >=3.10
29
+ Requires-Dist: matplotlib>=3.8
30
+ Requires-Dist: numpy>=1.24
31
+ Requires-Dist: polars>=1.0
32
+ Requires-Dist: prompt-toolkit>=3.0
33
+ Requires-Dist: rich>=13.0
34
+ Requires-Dist: scipy>=1.12
35
+ Requires-Dist: statsmodels>=0.14
36
+ Requires-Dist: typer>=0.12
37
+ Provides-Extra: all
38
+ Requires-Dist: arch>=6.0; extra == 'all'
39
+ Requires-Dist: connectorx>=0.3; extra == 'all'
40
+ Requires-Dist: duckdb>=0.10; extra == 'all'
41
+ Requires-Dist: fastapi>=0.100; extra == 'all'
42
+ Requires-Dist: ipython>=8.0; extra == 'all'
43
+ Requires-Dist: lifelines>=0.28; extra == 'all'
44
+ Requires-Dist: linearmodels>=6.0; extra == 'all'
45
+ Requires-Dist: nbformat>=5.0; extra == 'all'
46
+ Requires-Dist: networkx>=3.0; extra == 'all'
47
+ Requires-Dist: openpyxl>=3.1; extra == 'all'
48
+ Requires-Dist: pandas>=2.0; extra == 'all'
49
+ Requires-Dist: plotly>=5.0; extra == 'all'
50
+ Requires-Dist: pyreadstat>=1.0; extra == 'all'
51
+ Requires-Dist: python-docx>=1.1; extra == 'all'
52
+ Requires-Dist: python-multipart; extra == 'all'
53
+ Requires-Dist: python-pptx>=0.6; extra == 'all'
54
+ Requires-Dist: rapidfuzz>=3.0; extra == 'all'
55
+ Requires-Dist: reportlab>=4.0; extra == 'all'
56
+ Requires-Dist: scikit-learn>=1.4; extra == 'all'
57
+ Requires-Dist: semopy>=2.3; extra == 'all'
58
+ Requires-Dist: shap>=0.44; extra == 'all'
59
+ Requires-Dist: textual>=0.60; extra == 'all'
60
+ Requires-Dist: uvicorn>=0.30; extra == 'all'
61
+ Requires-Dist: websockets; extra == 'all'
62
+ Requires-Dist: xlsxwriter>=3.1; extra == 'all'
63
+ Provides-Extra: anthropic
64
+ Requires-Dist: anthropic>=0.20; extra == 'anthropic'
65
+ Provides-Extra: database
66
+ Requires-Dist: connectorx>=0.3; extra == 'database'
67
+ Provides-Extra: dev
68
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
69
+ Requires-Dist: pytest>=8.0; extra == 'dev'
70
+ Provides-Extra: duckdb
71
+ Requires-Dist: duckdb>=0.10; extra == 'duckdb'
72
+ Provides-Extra: excel
73
+ Requires-Dist: openpyxl>=3.1; extra == 'excel'
74
+ Requires-Dist: xlsxwriter>=3.1; extra == 'excel'
75
+ Provides-Extra: factor
76
+ Requires-Dist: scikit-learn>=1.4; extra == 'factor'
77
+ Provides-Extra: fuzzy
78
+ Requires-Dist: rapidfuzz>=3.0; extra == 'fuzzy'
79
+ Provides-Extra: garch
80
+ Requires-Dist: arch>=6.0; extra == 'garch'
81
+ Provides-Extra: interactive
82
+ Requires-Dist: plotly>=5.0; extra == 'interactive'
83
+ Provides-Extra: jupyter
84
+ Requires-Dist: ipython>=8.0; extra == 'jupyter'
85
+ Provides-Extra: ml
86
+ Requires-Dist: scikit-learn>=1.4; extra == 'ml'
87
+ Provides-Extra: network
88
+ Requires-Dist: networkx>=3.0; extra == 'network'
89
+ Provides-Extra: notebook
90
+ Requires-Dist: nbformat>=5.0; extra == 'notebook'
91
+ Provides-Extra: openai
92
+ Requires-Dist: openai>=1.0; extra == 'openai'
93
+ Provides-Extra: panel
94
+ Requires-Dist: linearmodels>=6.0; extra == 'panel'
95
+ Provides-Extra: pdf
96
+ Requires-Dist: reportlab>=4.0; extra == 'pdf'
97
+ Provides-Extra: rbridge
98
+ Requires-Dist: rpy2>=3.5; extra == 'rbridge'
99
+ Provides-Extra: report
100
+ Requires-Dist: python-docx>=1.1; extra == 'report'
101
+ Requires-Dist: python-pptx>=0.6; extra == 'report'
102
+ Provides-Extra: sas
103
+ Requires-Dist: pandas>=2.0; extra == 'sas'
104
+ Requires-Dist: pyreadstat>=1.0; extra == 'sas'
105
+ Provides-Extra: sem
106
+ Requires-Dist: semopy>=2.3; extra == 'sem'
107
+ Provides-Extra: shap
108
+ Requires-Dist: shap>=0.44; extra == 'shap'
109
+ Provides-Extra: spss
110
+ Requires-Dist: pandas>=2.0; extra == 'spss'
111
+ Requires-Dist: pyreadstat>=1.0; extra == 'spss'
112
+ Provides-Extra: stata
113
+ Requires-Dist: pandas>=2.0; extra == 'stata'
114
+ Requires-Dist: pyreadstat>=1.0; extra == 'stata'
115
+ Provides-Extra: survival
116
+ Requires-Dist: lifelines>=0.28; extra == 'survival'
117
+ Provides-Extra: tui
118
+ Requires-Dist: textual>=0.60; extra == 'tui'
119
+ Provides-Extra: web
120
+ Requires-Dist: fastapi>=0.100; extra == 'web'
121
+ Requires-Dist: python-multipart; extra == 'web'
122
+ Requires-Dist: uvicorn>=0.30; extra == 'web'
123
+ Requires-Dist: websockets; extra == 'web'
124
+ Description-Content-Type: text/markdown
125
+
126
+ <p align="center">
127
+ <img src="https://img.shields.io/badge/version-1.0.0-blue?style=for-the-badge" alt="Version">
128
+ <img src="https://img.shields.io/badge/python-3.10%2B-brightgreen?style=for-the-badge&logo=python&logoColor=white" alt="Python">
129
+ <img src="https://img.shields.io/badge/license-MIT-orange?style=for-the-badge" alt="License">
130
+ <img src="https://img.shields.io/badge/tests-816%20passed-success?style=for-the-badge" alt="Tests">
131
+ <img src="https://img.shields.io/badge/commands-231-blueviolet?style=for-the-badge" alt="Commands">
132
+ <img src="https://img.shields.io/badge/powered%20by-Polars%20%7C%20statsmodels-purple?style=for-the-badge" alt="Stack">
133
+ </p>
134
+
135
+ <h1 align="center">OpenStat</h1>
136
+
137
+ <p align="center">
138
+ <strong>The open-source statistical analysis tool you've been waiting for.</strong><br>
139
+ Load data. Explore. Transform. Model. Plot. Report. All from your terminal.
140
+ </p>
141
+
142
+ <p align="center">
143
+ <a href="#-quick-start">Quick Start</a> &bull;
144
+ <a href="#-why-openstat">Why OpenStat?</a> &bull;
145
+ <a href="#-full-command-reference">Commands</a> &bull;
146
+ <a href="#-statistical-models">Models</a> &bull;
147
+ <a href="#-contributing">Contributing</a>
148
+ </p>
149
+
150
+ ---
151
+
152
+ > **Note:** OpenStat is an independent, community-driven open-source project. It is not affiliated with, endorsed by, or connected to StataCorp LLC or any commercial statistical software vendor.
153
+
154
+ ## Why OpenStat?
155
+
156
+ **Statistical analysis shouldn't require expensive licenses.** Every researcher, student, data scientist, and curious mind deserves access to professional-grade statistical tools — for free, forever.
157
+
158
+ OpenStat brings the familiar workflow of commercial statistical packages into your terminal with a clean, intuitive REPL. It's built on the incredible open-source Python ecosystem (Polars, statsmodels, scipy) and designed to be:
159
+
160
+ - **Accessible** — No licensing fees. No registration. Just `pip install` and go.
161
+ - **Familiar** — If you've used Stata, R, or SPSS, you'll feel right at home.
162
+ - **Fast** — Powered by [Polars](https://pola.rs/) (not pandas) for blazing-fast data operations.
163
+ - **Safe** — No `eval()` anywhere. All user expressions go through a secure whitelist parser.
164
+ - **Scriptable** — Write `.ost` scripts for reproducible analysis pipelines.
165
+ - **Extensible** — Adding a new command takes 10 lines of code. Seriously.
166
+
167
+ ---
168
+
169
+ ## Quick Start
170
+
171
+ ### Installation
172
+
173
+ ```bash
174
+ # Clone the repository
175
+ git clone https://github.com/baristiran/OpenStat.git
176
+ cd OpenStat
177
+
178
+ # Create a virtual environment (recommended)
179
+ python -m venv .venv
180
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
181
+
182
+ # Install OpenStat in editable mode with the development (test) dependencies
183
+ pip install -e ".[dev]"
184
+ ```
185
+
186
+ ### Launch the Interactive REPL
187
+
188
+ ```bash
189
+ openstat repl
190
+ ```
191
+
192
+ ```
193
+ OpenStat v1.0.0 — Open-source statistical analysis tool
194
+ Type help for commands, quit to exit.
195
+
196
+ openstat> load examples/data.csv
197
+ Loaded 50 rows x 7 columns from examples/data.csv
198
+
199
+ openstat> summarize age income score
200
+ ┌──────────┬────┬─────────┬─────────┬───────┬─────────┬─────────┬─────────┬─────────┐
201
+ │ Variable │ N │ Mean │ SD │ Min │ P25 │ P50 │ P75 │ Max │
202
+ ├──────────┼────┼─────────┼─────────┼───────┼─────────┼─────────┼─────────┼─────────┤
203
+ │ age │ 50 │ 34.6600 │ 8.7634 │ 21.00 │ 27.2500 │ 34.0000 │ 42.5000 │ 53.0000 │
204
+ │ income │ 50 │ 49840.0 │ 17547.2 │ 26000 │ 34000.0 │ 47000.0 │ 66000.0 │ 88000.0 │
205
+ │ score │ 50 │ 7.4280 │ 1.2844 │ 4.90 │ 6.4750 │ 7.5000 │ 8.5500 │ 9.4000 │
206
+ └──────────┴────┴─────────┴─────────┴───────┴─────────┴─────────┴─────────┴─────────┘
207
+
208
+ openstat> ols score ~ age + income --robust
209
+ ┌──────────┬────────┬─────────┬───────┬────────┬────────────┬─────────────┐
210
+ │ Variable │ Coef │ Std.Err │ t/z │ P>|t| │ [95% CI L] │ [95% CI H] │
211
+ ├──────────┼────────┼─────────┼───────┼────────┼────────────┼─────────────┤
212
+ │ _cons │ 2.1435 │ 0.4521 │ 4.741 │ 0.0000 │ 1.2343 │ 3.0527 │
213
+ │ age │ 0.0312 │ 0.0187 │ 1.668 │ 0.1018 │ -0.0066 │ 0.0690 │
214
+ │ income │ 0.0001 │ 0.0000 │ 5.234 │ 0.0000 │ 0.0000 │ 0.0001 │
215
+ └──────────┴────────┴─────────┴───────┴────────┴────────────┴─────────────┘
216
+ N = 50 | R² = 0.5481 | Adj.R² = 0.5289 | F(2, 47) = 28.52 (p=0.0000)
217
+
218
+ openstat> predict yhat
219
+ Predictions added as 'yhat'. 50 rows x 8 columns.
220
+
221
+ openstat> quit
222
+ Bye!
223
+ ```
224
+
225
+ ### Run a Script
226
+
227
+ ```bash
228
+ # Run an analysis script
229
+ openstat run examples/demo.ost
230
+
231
+ # Strict mode — stop on first error (great for CI/CD)
232
+ openstat run examples/demo.ost --strict
233
+ ```
234
+
235
+ ---
236
+
237
+ ## What's New in v0.2.0
238
+
239
+ Version 0.2.0 is a massive leap in statistical depth. Here's what's new:
240
+
241
+ | Feature | What it does | Example |
242
+ |---------|-------------|---------|
243
+ | **Interaction Terms** | Model interactions between variables | `ols y ~ x1*x2` or `ols y ~ x1:x2` |
244
+ | **Cluster-Robust SE** | Standard errors robust to within-group correlation | `ols y ~ x1 + x2 --cluster=region` |
245
+ | **Poisson Regression** | Count data modeling with optional exposure offset | `poisson visits ~ age + income --exposure=time` |
246
+ | **Negative Binomial** | Overdispersed count data (reports dispersion alpha) | `negbin claims ~ age + gender` |
247
+ | **Quantile Regression** | Model any quantile, not just the mean | `quantreg y ~ x1 + x2 tau=0.75` |
248
+ | **Marginal Effects** | Average or at-means marginal effects for logit/probit | `margins --at=average` |
249
+ | **Bootstrap CI** | Non-parametric confidence intervals via resampling | `bootstrap n=1000 ci=95` |
250
+ | **Post-Estimation Diagnostics** | Breusch-Pagan, Ramsey RESET, link test, IC | `estat all` |
251
+ | **Model Comparison** | Side-by-side model comparison table | `estimates table` |
252
+ | **Multi-Way Interactions** | Three-way and beyond: `x1*x2*x3` auto-expands | Full factorial expansion |
253
+
254
+ ---
255
+
256
+ ## Full Command Reference
257
+
258
+ ### Data Management
259
+
260
+ | Command | Description | Example |
261
+ |---------|-------------|---------|
262
+ | `load <path>` | Load CSV, Parquet, Stata (.dta), Excel (.xlsx) | `load survey.csv` |
263
+ | `save <path>` | Save data to any supported format | `save results.parquet` |
264
+ | `describe` | Show dataset structure (types, nulls) | `describe` |
265
+ | `head [N]` | Show first N rows (default: 10) | `head 20` |
266
+ | `tail [N]` | Show last N rows | `tail 5` |
267
+ | `count` | Row and column count | `count` |
268
+ | `merge <path> on <key> [how=...]` | Join with another file | `merge scores.csv on id how=left` |
269
+ | `undo` | Undo last data change (multi-level) | `undo` |
270
+
271
+ ### Data Transformation
272
+
273
+ | Command | Description | Example |
274
+ |---------|-------------|---------|
275
+ | `filter <expr>` | Filter rows with expressions | `filter age > 30 and income < 50000` |
276
+ | `select <cols>` | Keep specific columns | `select age income score` |
277
+ | `derive <col> = <expr>` | Create new variables | `derive bmi = weight / (height ** 2)` |
278
+ | `dropna [cols]` | Drop missing values | `dropna age income` |
279
+ | `fillna <col> <strategy>` | Fill missing values | `fillna income median` |
280
+ | `sort <col> [--desc]` | Sort dataset | `sort income --desc` |
281
+ | `rename <old> <new>` | Rename a column | `rename income salary` |
282
+ | `cast <col> <type>` | Cast column type | `cast age float` |
283
+ | `encode <col> [as <new>]` | Label-encode strings | `encode region as region_code` |
284
+ | `recode <col> old=new ...` | Recode values | `recode region North=N South=S` |
285
+ | `replace <col> <old> <new>` | Replace values | `replace region North Norte` |
286
+ | `sample <N\|N%>` | Random sample | `sample 100` or `sample 10%` |
287
+ | `duplicates [drop] [cols]` | Find or drop duplicates | `duplicates drop` |
288
+ | `unique <col>` | List unique values | `unique region` |
289
+ | `lag <col> [N]` | Lag variable (shift down) | `lag price 2` |
290
+ | `lead <col> [N]` | Lead variable (shift up) | `lead price` |
291
+ | `pivot <val> by <col>` | Reshape to wide format | `pivot score by subject over name` |
292
+ | `melt <ids>, <vals>` | Reshape to long format | `melt name, math eng` |
293
+
294
+ ### Descriptive Statistics
295
+
296
+ | Command | Description | Example |
297
+ |---------|-------------|---------|
298
+ | `summarize [cols]` | Summary statistics (N, Mean, SD, quartiles) | `summarize age income` |
299
+ | `tabulate <col>` | Frequency table (top 50 values) | `tabulate education` |
300
+ | `crosstab <row> <col>` | Two-way contingency table with row percentages | `crosstab gender status` |
301
+ | `corr [cols]` | Pearson correlation matrix | `corr age income score` |
302
+ | `groupby <cols> summarize <aggs>` | Group-by with aggregations | `groupby region summarize mean(income) count()` |
303
+
304
+ ### Statistical Models
305
+
306
+ | Command | Description | Example |
307
+ |---------|-------------|---------|
308
+ | `ols y ~ x1 + x2` | OLS linear regression | `ols score ~ age + income --robust` |
309
+ | `logit y ~ x1 + x2` | Logistic regression (binary) | `logit employed ~ age + income` |
310
+ | `probit y ~ x1 + x2` | Probit regression (binary) | `probit employed ~ age + income` |
311
+ | `poisson y ~ x1 + x2` | Poisson regression (counts) | `poisson visits ~ age --exposure=time` |
312
+ | `negbin y ~ x1 + x2` | Negative Binomial (overdispersed) | `negbin claims ~ age + gender` |
313
+ | `quantreg y ~ x1 + x2` | Quantile regression | `quantreg wage ~ edu + exp tau=0.9` |
314
+
315
+ **All models support:** `--robust` (heteroscedasticity-robust SE), `--cluster=col` (cluster-robust SE)
316
+
317
+ **Formula syntax:**
318
+ - `y ~ x1 + x2` — standard predictors
319
+ - `y ~ x1*x2` — full factorial (expands to `x1 + x2 + x1:x2`)
320
+ - `y ~ x1:x2` — interaction term only
321
+ - `y ~ x1*x2*x3` — three-way interaction (all combinations)
322
+
323
+ ### Post-Estimation
324
+
325
+ | Command | Description | Example |
326
+ |---------|-------------|---------|
327
+ | `predict [name]` | Predicted values from last model | `predict yhat` |
328
+ | `residuals [name]` | Residuals + diagnostic plots | `residuals resid` |
329
+ | `vif` | Variance Inflation Factor | `vif` |
330
+ | `margins [--at=means\|average]` | Marginal effects (logit/probit) | `margins --at=average` |
331
+ | `bootstrap [n=N] [ci=N]` | Bootstrap confidence intervals | `bootstrap n=1000 ci=95` |
332
+ | `estat <sub>` | Post-estimation diagnostics | `estat all` |
333
+ | `estimates table` | Side-by-side model comparison | `estimates table` |
334
+ | `stepwise y ~ x1 + ...` | Stepwise variable selection | `stepwise y ~ x1 + x2 + x3 --backward` |
335
+ | `latex [path.tex]` | Export model as LaTeX table | `latex results.tex` |
336
+
337
+ **`estat` subcommands:**
338
+ - `estat hettest` — Breusch-Pagan heteroscedasticity test
339
+ - `estat ovtest` — Ramsey RESET specification test
340
+ - `estat linktest` — Link test for model specification
341
+ - `estat ic` — Information criteria (AIC, BIC, Log-Likelihood)
342
+ - `estat all` — Run all diagnostics at once
343
+
344
+ ### Hypothesis Tests
345
+
346
+ | Command | Description | Example |
347
+ |---------|-------------|---------|
348
+ | `ttest <col> [mu=N]` | One-sample t-test (H0: mean = mu, default 0) | `ttest score mu=7` |
349
+ | `ttest <col> by <group>` | Two-sample Welch t-test | `ttest income by employed` |
350
+ | `ttest <col> paired <col2>` | Paired t-test | `ttest before paired after` |
351
+ | `chi2 <col1> <col2>` | Chi-square independence test | `chi2 region employed` |
352
+ | `anova <col> by <group>` | One-way ANOVA (F-test) | `anova score by region` |
353
+
354
+ ### Visualization
355
+
356
+ | Command | Description | Example |
357
+ |---------|-------------|---------|
358
+ | `plot hist <col>` | Histogram | `plot hist age` |
359
+ | `plot scatter <y> <x>` | Scatter plot | `plot scatter score income` |
360
+ | `plot line <y> <x>` | Line plot | `plot line score age` |
361
+ | `plot box <col> [by <g>]` | Box plot (optionally grouped) | `plot box income by region` |
362
+ | `plot bar <col> [by <g>]` | Bar chart | `plot bar income by region` |
363
+ | `plot heatmap [cols]` | Correlation heatmap | `plot heatmap age income score` |
364
+ | `plot diagnostics` | Residual diagnostic plots | `plot diagnostics` |
365
+
366
+ ### Other
367
+
368
+ | Command | Description | Example |
369
+ |---------|-------------|---------|
370
+ | `report <path>` | Generate Markdown report | `report analysis.md` |
371
+ | `help [cmd]` | Show help (all or specific command) | `help ols` |
372
+ | `quit` / `exit` / `q` | Exit REPL | `quit` |
373
+
374
+ ---
375
+
376
+ ## Expression Language
377
+
378
+ Expressions used by `filter` and `derive` are handled by a **safe, recursive-descent parser** — no Python `eval()` is ever used.
379
+
380
+ ```bash
381
+ # Arithmetic
382
+ openstat> derive income_k = income / 1000
383
+ openstat> derive bmi = weight / (height ** 2)
384
+
385
+ # Comparisons and boolean logic
386
+ openstat> filter age > 30 and income < 50000
387
+ openstat> filter not is_null(score) and region == "North"
388
+
389
+ # Functions
390
+ openstat> derive log_income = log(income)
391
+ openstat> derive name_upper = upper(name)
392
+ openstat> derive score_clean = fill_null(score, 0)
393
+ ```
394
+
395
+ **Available functions:**
396
+
397
+ | Category | Functions |
398
+ |----------|----------|
399
+ | Math | `log(x)`, `log10(x)`, `sqrt(x)`, `abs(x)`, `exp(x)`, `round(x, n)` |
400
+ | String | `upper(x)`, `lower(x)`, `len_chars(x)`, `strip(x)`, `contains(x, "pattern")` |
401
+ | Null | `is_null(x)`, `is_not_null(x)`, `fill_null(x, value)` |
402
+ | Type | `cast_float(x)`, `cast_int(x)`, `cast_str(x)` |
403
+
404
+ ---
405
+
406
+ ## Statistical Models — In Depth
407
+
408
+ ### Automatic Diagnostics
409
+
410
+ Every model automatically checks for common problems and warns you:
411
+
412
+ - **Multicollinearity** — Condition number > 30 triggers a warning
413
+ - **Heteroscedasticity** — Breusch-Pagan test; suggests `--robust` if p < 0.05
414
+ - **Autocorrelation** — Durbin-Watson statistic far from 2.0
415
+ - **Convergence** — Warns if logit/probit MLE did not converge
416
+ - **Missing values** — Reports how many observations were dropped
417
+ - **Low sample size** — Warns when observation-to-predictor ratio is low
418
+
419
+ ### Interaction Terms
420
+
421
+ ```bash
422
+ # Full factorial: automatically expands to x1 + x2 + x1:x2
423
+ openstat> ols y ~ x1*x2
424
+
425
+ # Main effects plus an explicitly written interaction term (equivalent to x1*x2)
426
+ openstat> ols y ~ x1 + x2 + x1:x2
427
+
428
+ # Three-way interaction (7 terms total)
429
+ openstat> ols y ~ x1*x2*x3
430
+ ```
431
+
432
+ ### Cluster-Robust Standard Errors
433
+
434
+ ```bash
435
+ # Clustered at the region level
436
+ openstat> ols wage ~ education + experience --cluster=region
437
+
438
+ # Works with all model types
439
+ openstat> logit promoted ~ age + performance --cluster=department
440
+ ```
441
+
442
+ ### Marginal Effects
443
+
444
+ After fitting a logit or probit model, compute marginal effects to understand the practical impact:
445
+
446
+ ```bash
447
+ openstat> logit employed ~ age + education + income
448
+ openstat> margins # Average marginal effects (default)
449
+ openstat> margins --at=means # Marginal effects at means
450
+ ```
451
+
452
+ ### Bootstrap Confidence Intervals
453
+
454
+ Non-parametric bootstrap for any model — no distributional assumptions needed:
455
+
456
+ ```bash
457
+ openstat> ols y ~ x1 + x2
458
+ openstat> bootstrap n=1000 ci=95 # 1000 replications, 95% CI
459
+ openstat> bootstrap n=5000 ci=99 # More replications, 99% CI
460
+ ```
461
+
462
+ Bootstrap uses thread-pool parallelism for speed when n > 100.
463
+
464
+ ### Model Comparison
465
+
466
+ Run multiple models and compare them side-by-side:
467
+
468
+ ```bash
469
+ openstat> ols y ~ x1
470
+ openstat> ols y ~ x1 + x2
471
+ openstat> ols y ~ x1 + x2 + x1:x2
472
+ openstat> estimates table
473
+ ```
474
+
475
+ This produces a publication-ready comparison table with coefficients, standard errors, R², AIC, BIC, and more.
476
+
477
+ ---
478
+
479
+ ## File Formats
480
+
481
+ | Format | Import | Export | Dependency |
482
+ |--------|:------:|:------:|------------|
483
+ | CSV | Yes | Yes | Built-in |
484
+ | Parquet | Yes | Yes | Built-in |
485
+ | Stata (.dta) | Yes | Yes | `pip install "openstat-cli[stata]"` |
486
+ | Excel (.xlsx) | Yes | Yes | `pip install "openstat-cli[excel]"` |
487
+
488
+ ---
489
+
490
+ ## Configuration
491
+
492
+ Customize OpenStat by creating `~/.openstat/config.toml`:
493
+
494
+ ```toml
495
+ [data]
496
+ output_dir = "outputs"
497
+ csv_separator = ","
498
+
499
+ [display]
500
+ tabulate_limit = 50
501
+ head_default = 10
502
+
503
+ [undo]
504
+ max_undo_stack = 20
505
+ max_undo_memory_mb = 500
506
+
507
+ [plotting]
508
+ plot_dpi = 150
509
+ plot_figsize_w = 8.0
510
+ plot_figsize_h = 5.0
511
+
512
+ [model]
513
+ condition_threshold = 30
514
+ min_obs_per_predictor = 5
515
+ bootstrap_iterations = 1000
516
+ ```
517
+
518
+ ---
519
+
520
+ ## CLI Options
521
+
522
+ ```bash
523
+ openstat repl # Interactive REPL
524
+ openstat run script.ost # Run a script
525
+ openstat run script.ost --strict # Stop on first error (exit code 1)
526
+ openstat --verbose repl # Verbose logging (INFO)
527
+ openstat --debug repl # Debug logging (DEBUG)
528
+ openstat --version # Show version
529
+ ```
530
+
531
+ Logs are saved to `~/.openstat/logs/openstat.log`.
532
+
533
+ ---
534
+
535
+ ## Aggregation Functions
536
+
537
+ For use with `groupby ... summarize`:
538
+
539
+ | Function | Description |
540
+ |----------|-------------|
541
+ | `mean(col)` | Arithmetic mean |
542
+ | `sd(col)` | Standard deviation (sample) |
543
+ | `sum(col)` | Sum |
544
+ | `min(col)` | Minimum |
545
+ | `max(col)` | Maximum |
546
+ | `median(col)` | Median |
547
+ | `count()` | Row count per group |
548
+
549
+ ---
550
+
551
+ ## Technology Stack
552
+
553
+ OpenStat is built on best-in-class open-source libraries:
554
+
555
+ | Component | Library | Why |
556
+ |-----------|---------|-----|
557
+ | Data Engine | [Polars](https://pola.rs/) | 10-100x faster than pandas, zero-copy, Rust-powered |
558
+ | Statistics | [statsmodels](https://www.statsmodels.org/) | Industry-standard OLS, GLM, quantile regression |
559
+ | Scientific | [SciPy](https://scipy.org/) | Hypothesis tests, distributions |
560
+ | Plotting | [matplotlib](https://matplotlib.org/) | Publication-quality figures |
561
+ | CLI Framework | [Typer](https://typer.tiangolo.com/) | Beautiful CLI with zero boilerplate |
562
+ | Terminal UI | [Rich](https://github.com/Textualize/rich) | Gorgeous tables and formatting |
563
+ | REPL | [prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/) | Tab completion, history, syntax |
564
+
565
+ ---
566
+
567
+ ## Project Structure
568
+
569
+ ```
570
+ OpenStat/
571
+ ├── src/openstat/
572
+ │ ├── cli.py # Typer CLI entry point
573
+ │ ├── repl.py # Interactive REPL with tab completion
574
+ │ ├── session.py # Session state, undo system
575
+ │ ├── config.py # Configuration management (~/.openstat/config.toml)
576
+ │ ├── commands/
577
+ │ │ ├── base.py # @command decorator, registry, CommandArgs
578
+ │ │ ├── data_cmds.py # load, filter, select, derive, sort, merge, ...
579
+ │ │ ├── stat_cmds.py # summarize, ols, logit, poisson, margins, ...
580
+ │ │ ├── plot_cmds.py # plot hist/scatter/line/box/bar/heatmap
581
+ │ │ └── report_cmds.py # report, help
582
+ │ ├── dsl/
583
+ │ │ ├── tokenizer.py # Safe expression tokenizer
584
+ │ │ └── parser.py # Recursive descent parser (no eval!)
585
+ │ ├── stats/
586
+ │ │ └── models.py # OLS, Logit, Probit, Poisson, NegBin, QuantReg, ...
587
+ │ ├── plots/
588
+ │ │ └── plotter.py # matplotlib chart generation
589
+ │ ├── io/
590
+ │ │ └── loader.py # CSV, Parquet, DTA, Excel loaders
591
+ │ └── reporting/
592
+ │ └── report.py # Markdown report generator
593
+ ├── tests/ # 343 tests (and growing!)
594
+ ├── examples/
595
+ │ ├── data.csv # Sample dataset
596
+ │ └── demo.ost # Demo script showcasing all features
597
+ ├── .github/workflows/
598
+ │ └── ci.yml # GitHub Actions: test on 4 Python versions x 2 OS
599
+ ├── pyproject.toml
600
+ ├── LICENSE # MIT
601
+ ├── CONTRIBUTING.md
602
+ └── README.md
603
+ ```
604
+
605
+ ---
606
+
607
+ ## Development
608
+
609
+ ```bash
610
+ # Install with dev dependencies
611
+ pip install -e ".[dev]"
612
+
613
+ # Run the full test suite
614
+ pytest
615
+
616
+ # Run with coverage
617
+ pytest --cov=openstat --cov-report=term-missing
618
+
619
+ # Run a specific test file
620
+ pytest tests/test_v020.py -v
621
+
622
+ # Lint
623
+ pip install ruff
624
+ ruff check src/ tests/
625
+ ```
626
+
627
+ **Current test status:** 343 tests passed, 0 failures across 11 test files.
628
+
629
+ ---
630
+
631
+ ## Contributing
632
+
633
+ **We love contributions!** Whether you're fixing a typo, adding a new command, improving documentation, or building an entire new feature — your contribution matters and is deeply appreciated.
634
+
635
+ OpenStat is built by the community, for the community. Every contribution makes statistical analysis more accessible to researchers, students, and data scientists around the world.
636
+
637
+ ### How to Contribute
638
+
639
+ 1. **Fork** the repository
640
+ 2. **Create** a feature branch (`git checkout -b feature/amazing-feature`)
641
+ 3. **Write** your code and tests
642
+ 4. **Ensure** all tests pass (`pytest`) and lint is clean (`ruff check src/ tests/`)
643
+ 5. **Submit** a Pull Request with a clear description
644
+
645
+ ### What Can You Contribute?
646
+
647
+ - **New statistical methods** — Panel data, time series, survival analysis, mixed models
648
+ - **New commands** — Any data manipulation or analysis command you find useful
649
+ - **DSL functions** — Add functions to the expression language
650
+ - **Plot types** — New visualization types
651
+ - **Documentation** — Tutorials, examples, translations
652
+ - **Bug reports** — Found something that doesn't work? Open an issue!
653
+ - **Performance** — Found a bottleneck? We'd love a PR!
654
+ - **File formats** — Support for more data formats (SAS, SPSS, etc.)
655
+
656
+ ### First-Time Contributors Welcome!
657
+
658
+ Never contributed to open source before? No problem! Look for issues labeled `good first issue`. We're happy to mentor and guide you through the process. Every expert was once a beginner.
659
+
660
+ Check out [CONTRIBUTING.md](CONTRIBUTING.md) for detailed setup instructions and coding guidelines.
661
+
662
+ ---
663
+
664
+ ## Roadmap
665
+
666
+ We have big plans for OpenStat. Here's what's coming:
667
+
668
+ ### Completed
669
+
670
+ - [x] OLS, Logit, Probit regression
671
+ - [x] Interaction terms (`x1*x2`, `x1:x2`, multi-way)
672
+ - [x] Cluster-robust standard errors
673
+ - [x] Poisson & Negative Binomial regression
674
+ - [x] Quantile regression
675
+ - [x] Marginal effects (average, at-means)
676
+ - [x] Bootstrap confidence intervals (parallelized)
677
+ - [x] Post-estimation diagnostics (`estat`)
678
+ - [x] Model comparison tables (`estimates table`)
679
+ - [x] Stepwise variable selection (forward/backward)
680
+ - [x] Robust standard errors (HC1)
681
+ - [x] Residual diagnostics with plots
682
+ - [x] VIF multicollinearity check
683
+ - [x] LaTeX table export
684
+ - [x] Data joining/merging
685
+ - [x] Pivot/melt reshaping
686
+ - [x] Safe expression language (no eval)
687
+ - [x] Tab completion in REPL
688
+ - [x] Configuration file support
689
+ - [x] Multi-level undo with memory management
690
+ - [x] CI/CD with GitHub Actions
691
+
692
+ ### Planned
693
+
694
+ - [ ] Panel data / fixed effects / random effects
695
+ - [ ] Time series analysis (ARIMA, VAR)
696
+ - [ ] Survival analysis (Cox PH, Kaplan-Meier)
697
+ - [ ] Mixed / hierarchical linear models
698
+ - [ ] Instrumental variables (IV / 2SLS)
699
+ - [ ] DuckDB / LazyFrame backend for large datasets
700
+ - [ ] Plugin / extension system
701
+ - [ ] Web-based GUI
702
+ - [ ] Jupyter notebook integration
703
+ - [ ] SAS (.sas7bdat) and SPSS (.sav) file support
704
+ - [ ] Multiple imputation for missing data
705
+ - [ ] Survey weighting support
706
+
707
+ ---
708
+
709
+ ## Community
710
+
711
+ OpenStat is more than code — it's a community of people who believe that statistical tools should be free and open. If you use OpenStat in your research, teaching, or work, we'd love to hear about it!
712
+
713
+ - **Star this repo** if you find it useful — it helps others discover the project
714
+ - **Share** with colleagues, students, and fellow researchers
715
+ - **Open issues** for bugs, feature requests, or questions
716
+ - **Join the conversation** in GitHub Discussions
717
+
718
+ ---
719
+
720
+ ## Acknowledgements
721
+
722
+ OpenStat stands on the shoulders of giants. Huge thanks to the maintainers and contributors of:
723
+
724
+ - [Polars](https://pola.rs/) — for reimagining what a DataFrame library can be
725
+ - [statsmodels](https://www.statsmodels.org/) — for bringing professional statistics to Python
726
+ - [SciPy](https://scipy.org/) — for decades of scientific computing excellence
727
+ - [Rich](https://github.com/Textualize/rich) — for making terminal output beautiful
728
+ - [prompt-toolkit](https://python-prompt-toolkit.readthedocs.io/) — for the interactive REPL foundation
729
+
730
+ And to every researcher, student, and data scientist who believes in open science. This project is for you.
731
+
732
+ ---
733
+
734
+ ## License
735
+
736
+ MIT License. See [LICENSE](LICENSE) for details.
737
+
738
+ Free as in freedom. Free as in beer. Use it, modify it, share it, sell it — the only condition is that the copyright and license notice stay with copies of the software.
739
+
740
+ ---
741
+
742
+ <p align="center">
743
+ <strong>If OpenStat helps your work, give it a star! Every star helps more people discover free statistical tools.</strong>
744
+ </p>
745
+
746
+ <p align="center">
747
+ Made with care for the open-source community.
748
+ </p>