openstat-cli 1.0.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,153 @@
+ """Post-hoc comparison commands: posthoc (Tukey HSD, Bonferroni, Scheffé)."""
+
+ from __future__ import annotations
+
+ import re
+
+ import numpy as np
+ from scipy import stats
+ from statsmodels.stats.multicomp import MultiComparison
+
+ from openstat.commands.base import command
+ from openstat.session import Session
+
+
+ def _fmt_table(title: str, headers: list[str], rows: list[tuple]) -> str:
+     all_rows = [tuple(str(v) for v in r) for r in rows]
+     widths = [
+         max(len(headers[i]), max((len(r[i]) for r in all_rows), default=0))
+         for i in range(len(headers))
+     ]
+     sep = "-" * (sum(widths) + 3 * len(widths) + 1)
+     header_line = " | ".join(f"{h:<{w}}" for h, w in zip(headers, widths))
+     lines = [f"\n{title}", "=" * len(sep), header_line, sep]
+     for row in all_rows:
+         lines.append(" | ".join(f"{v:<{w}}" for v, w in zip(row, widths)))
+     lines.append("=" * len(sep))
+     return "\n".join(lines)
+
+
+ @command("posthoc", usage="posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]")
+ def cmd_posthoc(session: Session, args: str) -> str:
+     """Post-hoc pairwise comparisons after ANOVA (Tukey HSD, Bonferroni, Scheffé).
+
+     Examples:
+         posthoc score by(region)
+         posthoc income by(education) --bonferroni
+         posthoc age by(group) --scheffe
+     """
+     df = session.require_data()
+
+     m = re.search(r"by\((\w+)\)", args)
+     if not m:
+         return "Usage: posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]"
+     group_col = m.group(1)
+
+     rest = re.sub(r"by\([^)]*\)", "", args)
+     tokens = [t for t in rest.split() if not t.startswith("--")]
+     if not tokens:
+         return "Usage: posthoc <var> by(<group>) [--tukey|--bonferroni|--scheffe]"
+     var = tokens[0]
+
+     if var not in df.columns:
+         return f"Column not found: {var}"
+     if group_col not in df.columns:
+         return f"Column not found: {group_col}"
+
+     method = "tukey"
+     if "--bonferroni" in args:
+         method = "bonferroni"
+     elif "--scheffe" in args:
+         method = "scheffe"
+
+     sub = df.select([var, group_col]).drop_nulls()
+     values = sub[var].to_numpy(allow_copy=True).astype(float)
+     groups = sub[group_col].to_list()
+
+     group_labels = sorted(set(str(g) for g in groups))
+     groups_str = [str(g) for g in groups]
+
+     if len(group_labels) < 2:
+         return "Need at least 2 groups for post-hoc comparison."
+
+     # Run overall ANOVA first
+     group_arrays = [values[np.array([g == lbl for g in groups_str])] for lbl in group_labels]
+     f_stat, p_overall = stats.f_oneway(*group_arrays)
+     anova_line = f"Overall ANOVA: F = {f_stat:.4f}, p = {p_overall:.4f}"
+
+     try:
+         if method == "tukey":
+             mc = MultiComparison(values, groups_str)
+             result = mc.tukeyhsd()
+             summary_data = result.summary().data
+             headers = ["Group1", "Group2", "MeanDiff", "Lower", "Upper", "p-adj", "Reject H0"]
+             rows = []
+             for row in summary_data[1:]:
+                 g1, g2, meandiff, p_adj, lower, upper, reject = row
+                 rows.append((
+                     str(g1), str(g2),
+                     f"{float(meandiff):.4f}",
+                     f"{float(lower):.4f}",
+                     f"{float(upper):.4f}",
+                     f"{float(p_adj):.4f}",
+                     "Yes" if reject else "No",
+                 ))
+             return anova_line + _fmt_table("Tukey HSD Post-hoc Comparison", headers, rows)
+
+         elif method == "bonferroni":
+             n_pairs = len(group_labels) * (len(group_labels) - 1) // 2
+             alpha_adj = 0.05 / n_pairs
+             rows = []
+             for i in range(len(group_labels)):
+                 for j in range(i + 1, len(group_labels)):
+                     a = group_arrays[i]
+                     b = group_arrays[j]
+                     t_stat, p_raw = stats.ttest_ind(a, b)
+                     p_adj = min(p_raw * n_pairs, 1.0)
+                     diff = np.mean(a) - np.mean(b)
+                     rows.append((
+                         group_labels[i], group_labels[j],
+                         f"{diff:.4f}",
+                         f"{t_stat:.4f}",
+                         f"{p_raw:.4f}",
+                         f"{p_adj:.4f}",
+                         "Yes" if p_adj < 0.05 else "No",
+                     ))
+             headers = ["Group1", "Group2", "MeanDiff", "t-stat", "p-raw", "p-adj(Bonf)", "Reject H0"]
+             note = f"\n (Bonferroni correction: α_adj = 0.05/{n_pairs} = {alpha_adj:.5f})"
+             return anova_line + _fmt_table("Bonferroni Post-hoc Comparison", headers, rows) + note
+
+         elif method == "scheffe":
+             k = len(group_labels)
+             n_total = sum(len(d) for d in group_arrays)
+             n_per = [len(d) for d in group_arrays]
+             means = [np.mean(d) for d in group_arrays]
+
+             ss_within = sum(np.sum((d - np.mean(d)) ** 2) for d in group_arrays)
+             df_within = n_total - k
+             mse = ss_within / df_within
+             f_crit = stats.f.ppf(0.95, k - 1, df_within)
+             critical = (k - 1) * f_crit
+
+             rows = []
+             for i in range(k):
+                 for j in range(i + 1, k):
+                     diff = means[i] - means[j]
+                     f_s = diff ** 2 / (mse * (1.0 / n_per[i] + 1.0 / n_per[j]))
+                     p_val = 1.0 - stats.f.cdf(f_s / (k - 1), k - 1, df_within)
+                     rows.append((
+                         group_labels[i], group_labels[j],
+                         f"{diff:.4f}",
+                         f"{f_s:.4f}",
+                         f"{critical:.4f}",
+                         f"{p_val:.4f}",
+                         "Yes" if f_s > critical else "No",
+                     ))
+             headers = ["Group1", "Group2", "MeanDiff", "F*", "F-critical", "p-value", "Reject H0"]
+             note = f"\n (Scheffé critical value = (k-1)×F_crit = {k-1}×{f_crit:.4f} = {critical:.4f})"
+             return anova_line + _fmt_table("Scheffé Post-hoc Comparison", headers, rows) + note
+
+     except Exception as exc:
+         return f"posthoc error: {exc}"
+
+     return "Unknown method."
@@ -0,0 +1,172 @@
+ """Power analysis commands: power, sampsi."""
+
+ from __future__ import annotations
+
+ import re
+
+ from openstat.commands.base import command
+ from openstat.session import Session
+ from openstat.stats.power import (
+     power_onemean,
+     power_twomeans,
+     power_oneproportion,
+     power_twoproportions,
+     power_ols,
+     sampsi as _sampsi,
+ )
+
+
+ # ── Stata-style argument parser ────────────────────────────────────────────
+
+ def _stata_parse(raw: str) -> tuple[list[str], dict[str, str], set[str]]:
+     """Parse Stata-style args into positional tokens, key(value) options, and flags.
+
+     Handles:
+       - positional tokens (bare words, numbers)
+       - key(value) options, e.g. n(50), alpha(0.05)
+       - key=value options, e.g. n=50
+       - --flag options, e.g. --onesided
+       - commas are ignored (Stata separator)
+     """
+     opts: dict[str, str] = {}
+     positional: list[str] = []
+     flags: set[str] = set()
+
+     # key(value)
+     for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
+         opts[m.group(1).lower()] = m.group(2)
+
+     # Remove key(value) tokens from raw for further parsing
+     rest = re.sub(r'\w+\([^)]*\)', '', raw)
+
+     for tok in rest.split():
+         tok = tok.strip(',')
+         if not tok:
+             continue
+         if '=' in tok:
+             k, v = tok.split('=', 1)
+             opts[k.lower().lstrip('-')] = v
+         elif tok.startswith('--'):
+             flags.add(tok.lstrip('-').lower())
+         elif re.match(r'^-?(\d+\.?\d*|\.\d+)$', tok):
+             positional.append(tok)
+         elif re.match(r'^\w+$', tok):
+             # Bare word: a sub-command name or other positional token
+             positional.append(tok)
+
+     return positional, opts, flags
+
+
+ def _fmt_row(label: str, value) -> str:
+     return f" {label:<30} {value}"
+
+
+ def _power_table(result: dict) -> str:
+     lines = [f"\n{result['test']}", "-" * 50]
+     skip = {"test"}
+     for k, v in result.items():
+         if k in skip:
+             continue
+         if isinstance(v, float):
+             lines.append(_fmt_row(k, f"{v:.4f}"))
+         else:
+             lines.append(_fmt_row(k, v))
+     lines.append("-" * 50)
+     return "\n".join(lines)
+
+
+ # ── Command ────────────────────────────────────────────────────────────────
+
+ @command("power", usage="power onemean|twomeans|oneprop|twoprop|ols [options]")
+ def cmd_power(session: Session, args: str) -> str:
+     """Power analysis for common statistical tests."""
+     positional, opts, flags = _stata_parse(args)
+
+     if not positional:
+         return (
+             "Usage: power <subcommand> [options]\n"
+             "Subcommands: onemean, twomeans, oneprop, twoprop, ols\n\n"
+             "Examples:\n"
+             " power onemean, n(50) delta(0.5) sd(1)\n"
+             " power twomeans, n(80) delta(0.5) sd(1)\n"
+             " power oneprop, n(100) p0(0.5) pa(0.65)\n"
+             " power twoprop, p1(0.3) p2(0.5) power(0.80)\n"
+             " power ols, n(100) f2(0.15) k(3)"
+         )
+
+     sub = positional[0].lower()
+     alpha = float(opts.get("alpha", 0.05))
+     n = int(opts["n"]) if "n" in opts else None
+     pwr = float(opts["power"]) if "power" in opts else None
+     two_sided = "onesided" not in flags
+
+     try:
+         if sub in ("onemean", "one_mean"):
+             delta = float(opts.get("delta", 0.5))
+             sd = float(opts.get("sd", 1.0))
+             es = float(opts["es"]) if "es" in opts else None
+             result = power_onemean(
+                 effect_size=es, alpha=alpha, n=n, power=pwr,
+                 sd=sd, delta=delta, two_sided=two_sided,
+             )
+
+         elif sub in ("twomeans", "two_means"):
+             delta = float(opts.get("delta", 0.5))
+             sd = float(opts.get("sd", 1.0))
+             ratio = float(opts.get("ratio", 1.0))
+             n1 = int(opts["n1"]) if "n1" in opts else n
+             es = float(opts["es"]) if "es" in opts else None
+             result = power_twomeans(
+                 effect_size=es, alpha=alpha, n=n1, power=pwr,
+                 ratio=ratio, sd=sd, delta=delta, two_sided=two_sided,
+             )
+
+         elif sub in ("oneprop", "one_prop", "onepropo"):
+             p0 = float(opts.get("p0", 0.5))
+             pa = float(opts.get("pa", 0.6))
+             result = power_oneproportion(
+                 p0=p0, pa=pa, alpha=alpha, n=n, power=pwr, two_sided=two_sided,
+             )
+
+         elif sub in ("twoprop", "two_prop", "twopropo"):
+             p1 = float(opts.get("p1", 0.3))
+             p2 = float(opts.get("p2", 0.5))
+             result = power_twoproportions(
+                 p1=p1, p2=p2, alpha=alpha, n=n, power=pwr, two_sided=two_sided,
+             )
+
+         elif sub == "ols":
+             f2 = float(opts["f2"]) if "f2" in opts else None
+             k = int(opts.get("k", 1))
+             result = power_ols(f2=f2, alpha=alpha, n=n, power=pwr, k=k)
+
+         else:
+             return f"Unknown power subcommand: {sub}"
+
+     except (ValueError, TypeError) as exc:
+         return f"Power analysis error: {exc}"
+
+     return _power_table(result)
+
+
+ @command("sampsi", usage="sampsi mu1 mu2 [, sd(1) alpha(0.05) power(0.80)]")
+ def cmd_sampsi(session: Session, args: str) -> str:
+     """Compute required sample size (Stata-style sampsi)."""
+     positional, opts, flags = _stata_parse(args)
+
+     # Filter only numeric positionals
+     nums = [p for p in positional if re.match(r'^-?(\d+\.?\d*|\.\d+)$', p)]
+     if len(nums) < 2:
+         return "Usage: sampsi mu1 mu2 [, sd(1) alpha(0.05) power(0.80)]"
+
+     try:
+         mu1 = float(nums[0])
+         mu2 = float(nums[1])
+         sd = float(opts.get("sd", 1.0))
+         alpha = float(opts.get("alpha", 0.05))
+         pwr = float(opts.get("power", 0.80))
+         result = _sampsi(mu1=mu1, mu2=mu2, sd=sd, alpha=alpha, power=pwr)
+     except (ValueError, TypeError) as exc:
+         return f"sampsi error: {exc}"
+
+     return _power_table(result)
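
As a quick illustration of the parser above, this is what _stata_parse returns for a typical invocation (a sketch; option values stay strings until cmd_power converts them):

    positional, opts, flags = _stata_parse("twomeans, n(80) delta(0.5) sd(1) --onesided")
    # positional == ["twomeans"]
    # opts       == {"n": "80", "delta": "0.5", "sd": "1"}
    # flags      == {"onesided"}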
@@ -0,0 +1,246 @@
+ """Data profile and data dictionary commands."""
+
+ from __future__ import annotations
+
+ from openstat.commands.base import command, CommandArgs, friendly_error
+ from openstat.session import Session
+
+
+ @command("profile", usage="profile [col1 col2 ...] [--out=report.html]")
+ def cmd_profile(session: Session, args: str) -> str:
+     """Generate a comprehensive data profile report.
+
+     Shows for each column: type, missing count/%, unique values,
+     min/max/mean/std/median, top values, distribution shape (skewness/kurtosis).
+
+     Options:
+         --out=<path>   save as HTML report (default: outputs/profile.html)
+         --cols=<list>  comma-separated column subset
+
+     Examples:
+         profile
+         profile income age education
+         profile --out=data_profile.html
+     """
+     import polars as pl
+
+     ca = CommandArgs(args)
+     try:
+         df = session.require_data()
+     except RuntimeError as e:
+         return str(e)
+
+     # Column subset
+     cols_opt = ca.options.get("cols")
+     if cols_opt:
+         cols = [c.strip() for c in cols_opt.split(",")]
+     elif ca.positional:
+         cols = ca.positional
+     else:
+         cols = df.columns
+
+     missing_cols = [c for c in cols if c not in df.columns]
+     if missing_cols:
+         return f"Columns not found: {', '.join(missing_cols)}"
+
+     NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
+
+     lines = [
+         f"Data Profile: {session.dataset_name or 'dataset'}",
+         f"Shape: {df.height:,} rows × {df.width} columns | Showing {len(cols)} columns",
+         "=" * 72,
+     ]
+
+     for col in cols:
+         series = df[col]
+         n_miss = series.null_count()
+         miss_pct = 100 * n_miss / df.height if df.height else 0
+         n_uniq = series.drop_nulls().n_unique()
+         dtype = str(series.dtype)
+
+         lines.append(f"\n {col} [{dtype}]")
+         lines.append(f" Missing: {n_miss:,} ({miss_pct:.1f}%)")
+         lines.append(f" Unique: {n_uniq:,}")
+
+         if series.dtype in NUMERIC:
+             s = series.drop_nulls()
+             if s.len() > 0:
+                 lines.append(f" Mean: {s.mean():.4f}")
+                 lines.append(f" Std: {s.std():.4f}" if s.len() > 1 else " Std: —")
+                 lines.append(f" Min: {s.min():.4f}")
+                 lines.append(f" Median: {s.median():.4f}")
+                 lines.append(f" Max: {s.max():.4f}")
+                 # Skewness / kurtosis
+                 try:
+                     import numpy as np
+                     arr = s.to_numpy()
+                     from scipy.stats import skew, kurtosis
+                     lines.append(f" Skewness: {skew(arr):.3f}")
+                     lines.append(f" Kurtosis: {kurtosis(arr):.3f}")
+                 except Exception:
+                     pass
+                 # Zeros / negatives
+                 n_zero = int((s == 0).sum())
+                 n_neg = int((s < 0).sum())
+                 if n_zero or n_neg:
+                     lines.append(f" Zeros: {n_zero:,} Negative: {n_neg:,}")
+         else:
+             # Categorical / string
+             s = series.drop_nulls().cast(pl.Utf8)
+             top = s.value_counts().sort("count", descending=True).head(5)
+             if top.height > 0:
+                 top_vals = ", ".join(
+                     f"{row[0]}({row[1]})" for row in top.iter_rows()
+                 )
+                 lines.append(f" Top 5: {top_vals}")
+
+     lines.append("\n" + "=" * 72)
+
+     # HTML output
+     out_path = ca.options.get("out")
+     if out_path:
+         try:
+             _save_profile_html(lines, out_path, session)
+             lines.append(f"\nHTML report saved: {out_path}")
+         except Exception as exc:
+             lines.append(f"\nHTML save failed: {exc}")
+
+     return "\n".join(lines)
+
+
+ def _save_profile_html(lines: list[str], path: str, session: Session) -> None:
+     """Save a simple HTML version of the profile."""
+     import polars as pl
+     from pathlib import Path
+
+     df = session.df
+     NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
+
+     rows_html = ""
+     if df is not None:
+         for col in df.columns:
+             series = df[col]
+             n_miss = series.null_count()
+             miss_pct = f"{100*n_miss/df.height:.1f}%" if df.height else "—"
+             n_uniq = series.drop_nulls().n_unique()
+             dtype = str(series.dtype)
+             if series.dtype in NUMERIC:
+                 s = series.drop_nulls()
+                 stats = f"mean={s.mean():.3f}, std={s.std():.3f}" if s.len() > 0 else "—"
+             else:
+                 stats = f"{n_uniq} unique values"
+             rows_html += (
+                 f"<tr><td>{col}</td><td>{dtype}</td>"
+                 f"<td>{n_miss} ({miss_pct})</td>"
+                 f"<td>{n_uniq}</td><td>{stats}</td></tr>\n"
+             )
+
+     html = f"""<!DOCTYPE html>
+ <html><head><meta charset="utf-8">
+ <title>OpenStat Data Profile</title>
+ <style>
+ body {{font-family: sans-serif; margin: 2em; background: #f9f9f9;}}
+ h1 {{color: #333;}} table {{border-collapse: collapse; width: 100%;}}
+ th {{background: #4C72B0; color: white; padding: 8px;}}
+ td {{border: 1px solid #ddd; padding: 6px;}}
+ tr:nth-child(even) {{background: #f0f4ff;}}
+ </style></head><body>
+ <h1>Data Profile: {session.dataset_name or "dataset"}</h1>
+ <p>Shape: {session.shape_str}</p>
+ <table>
+ <tr><th>Column</th><th>Type</th><th>Missing</th><th>Unique</th><th>Stats</th></tr>
+ {rows_html}
+ </table>
+ <pre>{chr(10).join(lines)}</pre>
+ </body></html>"""
+
+     Path(path).parent.mkdir(parents=True, exist_ok=True)
+     Path(path).write_text(html, encoding="utf-8")
+
+
+ @command("datadict", usage="datadict [--out=dict.xlsx|dict.md]")
+ def cmd_datadict(session: Session, args: str) -> str:
+     """Generate a data dictionary for the current dataset.
+
+     Creates a table with: variable name, type, missing%, unique count,
+     min/max/mean for numeric, top values for categorical.
+
+     Options:
+         --out=<path>   save to Excel (.xlsx) or Markdown (.md)
+                        Default: outputs/data_dictionary.md
+
+     Examples:
+         datadict
+         datadict --out=dictionary.xlsx
+         datadict --out=docs/variables.md
+     """
+     import polars as pl
+     from pathlib import Path
+
+     ca = CommandArgs(args)
+     out_path = ca.options.get("out", "outputs/data_dictionary.md")
+
+     try:
+         df = session.require_data()
+     except RuntimeError as e:
+         return str(e)
+
+     NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
+
+     records = []
+     for col in df.columns:
+         series = df[col]
+         n_miss = series.null_count()
+         miss_pct = f"{100*n_miss/df.height:.1f}%"
+         n_uniq = series.drop_nulls().n_unique()
+         dtype = str(series.dtype)
+
+         if series.dtype in NUMERIC:
+             s = series.drop_nulls()
+             if s.len() > 0:
+                 extra = f"mean={s.mean():.3f}; range=[{s.min():.3f},{s.max():.3f}]"
+             else:
+                 extra = "all missing"
+         else:
+             top = series.drop_nulls().cast(pl.Utf8).value_counts().sort("count", descending=True).head(3)
+             top_vals = "; ".join(str(r[0]) for r in top.iter_rows())
+             extra = f"top: {top_vals}"
+
+         records.append({
+             "Variable": col,
+             "Type": dtype,
+             "Missing": f"{n_miss} ({miss_pct})",
+             "Unique": str(n_uniq),
+             "Notes": extra,
+             "Description": "",  # user fills in
+         })
+
+     Path(out_path).parent.mkdir(parents=True, exist_ok=True)
+
+     if out_path.endswith(".xlsx"):
+         try:
+             dict_df = pl.DataFrame(records)
+             dict_df.write_excel(out_path)
+             return f"Data dictionary saved: {out_path} ({len(records)} variables)"
+         except ImportError:
+             return "xlsxwriter required for Excel output. Try --out=dict.md"
+
+     else:  # Markdown
+         lines = [
+             f"# Data Dictionary: {session.dataset_name or 'dataset'}",
+             f"",
+             f"Shape: {df.height:,} rows × {df.width} columns",
+             f"",
+             "| Variable | Type | Missing | Unique | Notes | Description |",
+             "|---|---|---|---|---|---|",
+         ]
+         for r in records:
+             lines.append(
+                 f"| {r['Variable']} | {r['Type']} | {r['Missing']} | "
+                 f"{r['Unique']} | {r['Notes']} | {r['Description']} |"
+             )
+         Path(out_path).write_text("\n".join(lines), encoding="utf-8")
+         return f"Data dictionary saved: {out_path} ({len(records)} variables)"
@@ -0,0 +1,81 @@
+ """R bridge command: run R code from OpenStat (requires rpy2)."""
+
+ from __future__ import annotations
+
+ from openstat.commands.base import command, CommandArgs, friendly_error
+ from openstat.session import Session
+
+
+ @command("r", usage='r "<R code>"')
+ def cmd_r(session: Session, args: str) -> str:
+     """Execute R code in the current session (requires rpy2).
+
+     The current dataset is available in R as 'data'.
+     Results printed in R are captured and returned.
+     Modified 'data' is pulled back into the OpenStat session.
+
+     Examples:
+         r "summary(data)"
+         r "cor(data[, sapply(data, is.numeric)])"
+         r "data$log_income <- log(data$income + 1)"
+         r "lm_result <- lm(y ~ x1 + x2, data=data); summary(lm_result)"
+     """
+     try:
+         import rpy2.robjects as ro
+         from rpy2.robjects import pandas2ri
+         from rpy2.robjects.conversion import localconverter
+         import rpy2.rinterface_lib.callbacks as rcb
+     except ImportError:
+         return (
+             "rpy2 is required for the R bridge.\n"
+             "Install: pip install rpy2\n"
+             "Also requires a working R installation."
+         )
+
+     code = args.strip().strip('"\'')
+     if not code:
+         return 'Usage: r "<R code>"'
+
+     import io as _io
+     output_lines: list[str] = []
+
+     # Capture R output
+     def _capture(x):
+         output_lines.append(x)
+
+     old_write = rcb.consolewrite_print
+     rcb.consolewrite_print = _capture
+
+     try:
+         with localconverter(ro.default_converter + pandas2ri.converter):
+             # Push current dataframe into R as 'data'
+             if session.df is not None:
+                 try:
+                     r_df = ro.conversion.py2rpy(session.df.to_pandas())
+                     ro.globalenv["data"] = r_df
+                 except Exception:
+                     pass  # non-critical
+
+             # Execute R code
+             ro.r(code)
+
+             # Pull 'data' back if it was modified
+             try:
+                 r_data = ro.globalenv.get("data")
+                 if r_data is not None:
+                     import polars as pl
+                     pd_df = ro.conversion.rpy2py(r_data)
+                     new_df = pl.from_pandas(pd_df)
+                     if session.df is None or not new_df.equals(session.df):
+                         session.snapshot()
+                         session.df = new_df
+             except Exception:
+                 pass
+
+     except Exception as e:
+         return friendly_error(e, "R bridge")
+     finally:
+         rcb.consolewrite_print = old_write
+
+     result = "".join(output_lines).strip()
+     return result or "[R code executed — no output]"
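
A minimal standalone sketch of the polars → R → polars round trip that cmd_r performs, mirroring the rpy2 calls used above (assumes rpy2 plus a working R installation; toy data):

    import polars as pl
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter

    df = pl.DataFrame({"income": [10.0, 20.0, 30.0]})
    with localconverter(ro.default_converter + pandas2ri.converter):
        ro.globalenv["data"] = ro.conversion.py2rpy(df.to_pandas())      # push
        ro.r("data$log_income <- log(data$income + 1)")                  # run R code
        df = pl.from_pandas(ro.conversion.rpy2py(ro.globalenv["data"]))  # pull back
    print(df)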