openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""Network analysis commands: network descriptives, centrality, community detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.commands.base import command
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _require_nx():
|
|
10
|
+
try:
|
|
11
|
+
import networkx as nx
|
|
12
|
+
return nx
|
|
13
|
+
except ImportError:
|
|
14
|
+
raise ImportError("Network analysis requires networkx. Install with: pip install networkx")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@command("network", usage="network <subcommand> ...")
|
|
18
|
+
def cmd_network(session: Session, args: str) -> str:
|
|
19
|
+
"""Network analysis using NetworkX.
|
|
20
|
+
|
|
21
|
+
Subcommands:
|
|
22
|
+
network build from <source_col> to <target_col> [weight=<col>]
|
|
23
|
+
network describe
|
|
24
|
+
network centrality [--degree|--betweenness|--closeness|--eigenvector]
|
|
25
|
+
network community [--louvain|--greedy]
|
|
26
|
+
network plot [--layout=spring|circular|kamada]
|
|
27
|
+
|
|
28
|
+
Examples:
|
|
29
|
+
network build from sender to receiver
|
|
30
|
+
network build from from_node to to_node weight=strength
|
|
31
|
+
network describe
|
|
32
|
+
network centrality --degree
|
|
33
|
+
network community --greedy
|
|
34
|
+
network plot
|
|
35
|
+
"""
|
|
36
|
+
from openstat.commands.base import CommandArgs
|
|
37
|
+
ca = CommandArgs(args)
|
|
38
|
+
if not ca.positional:
|
|
39
|
+
return cmd_network.__doc__ or "Usage: network <subcommand>"
|
|
40
|
+
|
|
41
|
+
subcmd = ca.positional[0].lower()
|
|
42
|
+
|
|
43
|
+
if subcmd == "build":
|
|
44
|
+
return _build_network(session, args)
|
|
45
|
+
elif subcmd == "describe":
|
|
46
|
+
return _describe_network(session)
|
|
47
|
+
elif subcmd == "centrality":
|
|
48
|
+
return _centrality(session, args)
|
|
49
|
+
elif subcmd == "community":
|
|
50
|
+
return _community(session, args)
|
|
51
|
+
elif subcmd == "plot":
|
|
52
|
+
return _plot_network(session, args)
|
|
53
|
+
else:
|
|
54
|
+
return (
|
|
55
|
+
f"Unknown subcommand: {subcmd}\n"
|
|
56
|
+
"Available: build, describe, centrality, community, plot"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _build_network(session: Session, args: str) -> str:
    """Parse 'build from <src> to <tgt> [weight=<col>]' and build the graph.

    Stores the resulting graph on the session for the other subcommands.
    Duplicate edges in the data are collapsed, summing their weights.
    """
    nx = _require_nx()
    import re

    df = session.require_data()

    src_match = re.search(r"from\s+(\w+)", args)
    tgt_match = re.search(r"to\s+(\w+)", args)
    if src_match is None or tgt_match is None:
        return "Usage: network build from <source_col> to <target_col> [weight=<col>]"

    src_col = src_match.group(1)
    tgt_col = tgt_match.group(1)

    # Weight column is optional; accepts both "weight=col" and "weight col".
    weight_match = re.search(r"weight[= ](\w+)", args)
    weight_col = weight_match.group(1) if weight_match else None

    wanted = [src_col, tgt_col] + ([weight_col] if weight_col else [])
    for name in wanted:
        if name not in df.columns:
            return f"Column not found: {name}"

    edges = df.select(wanted).drop_nulls()

    G = nx.DiGraph() if "--directed" in args else nx.Graph()
    for row in edges.iter_rows():
        u = str(row[0])
        v = str(row[1])
        w = float(row[2]) if weight_col else 1.0
        # Repeated (u, v) pairs accumulate into a single weighted edge.
        if G.has_edge(u, v):
            G[u][v]["weight"] += w
        else:
            G.add_edge(u, v, weight=w)

    session._network = G
    session._network_weight_col = weight_col

    kind = "directed" if G.is_directed() else "undirected"
    return (
        f"Network built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges ({kind})\n"
        f"Source: '{src_col}' → Target: '{tgt_col}'"
        + (f" Weight: '{weight_col}'" if weight_col else "")
        + "\nUse 'network describe', 'network centrality', 'network community', 'network plot'"
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _describe_network(session: Session) -> str:
    """Summarize the stored network: size, density, connectivity, clustering, degrees.

    Returns a formatted text report, or an instruction string if no network
    has been built yet with 'network build'.
    """
    nx = _require_nx()
    G = getattr(session, "_network", None)
    if G is None:
        return "No network built. Use 'network build from <src> to <tgt>' first."

    n = G.number_of_nodes()
    e = G.number_of_edges()
    density = nx.density(G)
    # For directed graphs, connectivity is checked on the undirected projection
    # (i.e. weak connectivity).
    is_connected = nx.is_connected(G.to_undirected()) if G.is_directed() else nx.is_connected(G)

    lines = [
        f"Nodes: {n}",
        f"Edges: {e}",
        f"Density: {density:.4f}",
        f"Connected: {is_connected}",
    ]

    if not G.is_directed():
        if n > 0 and e > 0:
            try:
                avg_clust = nx.average_clustering(G)
                lines.append(f"Avg Clustering: {avg_clust:.4f}")
            except Exception:
                # Best-effort metric: skip silently if clustering fails.
                pass
        try:
            # Path length and diameter are only defined for connected graphs.
            if nx.is_connected(G):
                avg_path = nx.average_shortest_path_length(G)
                lines.append(f"Avg Path Length: {avg_path:.4f}")
                diam = nx.diameter(G)
                lines.append(f"Diameter: {diam}")
        except Exception:
            pass

    # Degree distribution summary
    import numpy as np
    degrees = [d for _, d in G.degree()]
    if degrees:
        lines += [
            "",
            f"Degree — Min: {min(degrees)} Mean: {np.mean(degrees):.2f} Max: {max(degrees)}",
        ]

    # Top 5 nodes by degree
    top = sorted(G.degree(), key=lambda x: x[1], reverse=True)[:5]
    if top:
        lines.append("")
        lines.append("Top nodes by degree:")
        for node, deg in top:
            lines.append(f" {str(node):<20} degree = {deg}")

    return "\n" + "=" * 50 + "\nNetwork Descriptives\n" + "=" * 50 + "\n" + "\n".join(lines) + "\n" + "=" * 50
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _centrality(session: Session, args: str) -> str:
    """Compute node centrality scores and list the top-ranked nodes.

    Degree centrality is the default; --betweenness, --closeness and
    --eigenvector select alternatives.
    """
    nx = _require_nx()
    G = getattr(session, "_network", None)
    if G is None:
        return "No network built. Use 'network build' first."

    if "--betweenness" in args:
        label = "Betweenness Centrality"
        scores = nx.betweenness_centrality(G)
    elif "--closeness" in args:
        label = "Closeness Centrality"
        scores = nx.closeness_centrality(G)
    elif "--eigenvector" in args:
        try:
            label = "Eigenvector Centrality"
            scores = nx.eigenvector_centrality(G, max_iter=1000)
        except nx.PowerIterationFailedConvergence:
            # Power iteration may fail on some graphs; degrade to degree scores.
            label = "Degree Centrality (fallback)"
            scores = nx.degree_centrality(G)
    else:
        label = "Degree Centrality"
        scores = nx.degree_centrality(G)

    top_n = 20
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_n]

    lines = [f"Top {min(top_n, len(ranked))} nodes by {label}:"]
    lines.append(f" {'Node':<25} {'Score':>10}")
    lines.append(" " + "-" * 37)
    for node, score in ranked:
        lines.append(f" {str(node):<25} {score:>10.4f}")

    return "\n" + "=" * 50 + f"\n{label}\n" + "=" * 50 + "\n" + "\n".join(lines) + "\n" + "=" * 50
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _community(session: Session, args: str) -> str:
    """Detect communities in the stored network and report their composition.

    Uses Louvain (python-louvain) when --louvain is passed, otherwise the
    greedy modularity algorithm built into networkx. Returns a formatted
    report, or an error string on any failure (best-effort command style).
    """
    nx = _require_nx()
    G = getattr(session, "_network", None)
    if G is None:
        return "No network built. Use 'network build' first."

    # Community algorithms here operate on undirected graphs.
    G_undir = G.to_undirected() if G.is_directed() else G

    try:
        if "--louvain" in args:
            try:
                import community as community_louvain
                partition = community_louvain.best_partition(G_undir)
                # Invert node -> community_id into community_id -> [nodes].
                communities = {}
                for node, comm_id in partition.items():
                    communities.setdefault(comm_id, []).append(node)
                method = "Louvain"
            except ImportError:
                return "Louvain requires python-louvain. Install with: pip install python-louvain"
        else:
            # Greedy modularity (built into networkx)
            from networkx.algorithms.community import greedy_modularity_communities
            comms = list(greedy_modularity_communities(G_undir))
            communities = {i: list(c) for i, c in enumerate(comms)}
            method = "Greedy Modularity"

        n_comm = len(communities)
        modularity = None
        try:
            from networkx.algorithms.community.quality import modularity as nx_mod
            modularity = nx_mod(G_undir, [set(v) for v in communities.values()])
        except Exception:
            # Modularity score is a nice-to-have; omit it if it cannot be computed.
            pass

        lines = [
            f"Method: {method}",
            f"Communities found: {n_comm}",
        ]
        if modularity is not None:
            lines.append(f"Modularity: {modularity:.4f}")
        lines.append("")

        # Largest communities first; show up to five member names each.
        for cid, members in sorted(communities.items(), key=lambda x: -len(x[1])):
            sample = ", ".join(str(m) for m in members[:5])
            if len(members) > 5:
                sample += f", ... (+{len(members)-5})"
            lines.append(f" Community {cid+1:>3}: {len(members):>4} nodes [{sample}]")

        return "\n" + "=" * 55 + f"\nCommunity Detection ({method})\n" + "=" * 55 + "\n" + "\n".join(lines) + "\n" + "=" * 55

    except Exception as exc:
        # Broad catch keeps the REPL alive; the error text is shown to the user.
        return f"community detection error: {exc}"
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _plot_network(session: Session, args: str) -> str:
    """Render the stored network to an image in the session output directory.

    Layout defaults to a seeded spring layout; --circular and --kamada
    select alternatives. The saved path is recorded on the session.
    """
    nx = _require_nx()
    G = getattr(session, "_network", None)
    if G is None:
        return "No network built. Use 'network build' first."

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    from openstat.plots.plotter import _unique_path

    # Layout selection (spring is seeded for reproducible output).
    if "--circular" in args:
        pos, layout_name = nx.circular_layout(G), "circular"
    elif "--kamada" in args:
        pos, layout_name = nx.kamada_kawai_layout(G), "kamada-kawai"
    else:
        pos, layout_name = nx.spring_layout(G, seed=42), "spring"

    n = G.number_of_nodes()
    # Figure, node and font sizes scale down as the graph grows.
    fig_size = min(max(6, n * 0.3), 16)
    fig, ax = plt.subplots(figsize=(fig_size, fig_size))

    node_size = max(50, min(500, 2000 // (n + 1)))
    nx.draw_networkx(
        G,
        pos=pos,
        ax=ax,
        node_size=node_size,
        node_color="#4C72B0",
        edge_color="#AAAAAA",
        font_size=max(6, min(10, 120 // (n + 1))),
        arrows=G.is_directed(),
        width=0.8,
        alpha=0.9,
    )
    ax.set_title(f"Network ({n} nodes, {G.number_of_edges()} edges) — {layout_name} layout")
    ax.axis("off")
    fig.tight_layout()

    session.output_dir.mkdir(parents=True, exist_ok=True)
    path = _unique_path(session.output_dir, "network_plot")
    fig.savefig(path, dpi=150)
    plt.close(fig)
    session.plot_paths.append(str(path))
    return f"Network plot saved: {path}"
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Natural language query: 'ask' command using OpenAI or Anthropic API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _build_context(session: Session) -> str:
    """Build a compact dataset context string for the LLM.

    Summarizes dataset shape, per-column dtype/missingness and basic stats
    (first 30 columns only, to bound prompt size), plus the most recent
    fitted model if any.
    """
    import polars as pl

    if session.df is None:
        return "No dataset loaded."

    df = session.df
    NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
               pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)

    col_info = []
    for c in df.columns[:30]:  # cap at 30 cols
        dtype = str(df[c].dtype)
        n_miss = df[c].null_count()
        if df[c].dtype in NUMERIC:
            col_data = df[c].drop_nulls()
            if col_data.len() > 0:
                # mean()/std() can return None (e.g. std of a single value
                # with ddof=1); guard so the f-string never formats None.
                mean_v = col_data.mean()
                sd_v = col_data.std()
                mean_s = f"{mean_v:.2f}" if mean_v is not None else "NA"
                sd_s = f"{sd_v:.2f}" if sd_v is not None else "NA"
                extra = f"mean={mean_s}, sd={sd_s}"
            else:
                extra = "all null"
        else:
            n_uniq = df[c].drop_nulls().n_unique()
            extra = f"{n_uniq} unique values"
        miss_str = f", {n_miss} missing" if n_miss else ""
        col_info.append(f" {c} ({dtype}{miss_str}): {extra}")

    lines = [
        f"Dataset: {session.dataset_name or 'unknown'}",
        f"Shape: {df.height} rows × {df.width} columns",
        "Columns:",
    ] + col_info

    if session.results:
        # NOTE(review): assumes every stored result exposes .name and .formula
        # — confirm against the result types in openstat.session.
        lines.append("\nLast model: " + session.results[-1].name +
                     " — " + session.results[-1].formula)

    return "\n".join(lines)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _ask_openai(question: str, context: str, model: str) -> str:
|
|
50
|
+
try:
|
|
51
|
+
from openai import OpenAI
|
|
52
|
+
except ImportError:
|
|
53
|
+
return None # type: ignore
|
|
54
|
+
|
|
55
|
+
client = OpenAI() # uses OPENAI_API_KEY env var
|
|
56
|
+
resp = client.chat.completions.create(
|
|
57
|
+
model=model or "gpt-4o-mini",
|
|
58
|
+
messages=[
|
|
59
|
+
{
|
|
60
|
+
"role": "system",
|
|
61
|
+
"content": (
|
|
62
|
+
"You are a helpful statistical analysis assistant for OpenStat, "
|
|
63
|
+
"a Python-based data analysis REPL similar to Stata. "
|
|
64
|
+
"Answer questions about the dataset concisely. "
|
|
65
|
+
"When relevant, suggest the exact OpenStat command to use."
|
|
66
|
+
),
|
|
67
|
+
},
|
|
68
|
+
{
|
|
69
|
+
"role": "user",
|
|
70
|
+
"content": f"Dataset context:\n{context}\n\nQuestion: {question}",
|
|
71
|
+
},
|
|
72
|
+
],
|
|
73
|
+
max_tokens=500,
|
|
74
|
+
)
|
|
75
|
+
return resp.choices[0].message.content.strip()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _ask_anthropic(question: str, context: str, model: str) -> str:
|
|
79
|
+
try:
|
|
80
|
+
import anthropic
|
|
81
|
+
except ImportError:
|
|
82
|
+
return None # type: ignore
|
|
83
|
+
|
|
84
|
+
client = anthropic.Anthropic() # uses ANTHROPIC_API_KEY env var
|
|
85
|
+
resp = client.messages.create(
|
|
86
|
+
model=model or "claude-haiku-4-5-20251001",
|
|
87
|
+
max_tokens=500,
|
|
88
|
+
system=(
|
|
89
|
+
"You are a helpful statistical analysis assistant for OpenStat, "
|
|
90
|
+
"a Python-based data analysis REPL similar to Stata. "
|
|
91
|
+
"Answer questions about the dataset concisely. "
|
|
92
|
+
"When relevant, suggest the exact OpenStat command to use."
|
|
93
|
+
),
|
|
94
|
+
messages=[
|
|
95
|
+
{
|
|
96
|
+
"role": "user",
|
|
97
|
+
"content": f"Dataset context:\n{context}\n\nQuestion: {question}",
|
|
98
|
+
}
|
|
99
|
+
],
|
|
100
|
+
)
|
|
101
|
+
return resp.content[0].text.strip()
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@command("ask", usage='ask "<natural language question>"')
|
|
105
|
+
def cmd_ask(session: Session, args: str) -> str:
|
|
106
|
+
"""Ask a natural language question about your dataset (requires AI API key).
|
|
107
|
+
|
|
108
|
+
Uses OpenAI (OPENAI_API_KEY) or Anthropic (ANTHROPIC_API_KEY), whichever
|
|
109
|
+
is available. The assistant can suggest OpenStat commands, explain results,
|
|
110
|
+
and answer statistical questions.
|
|
111
|
+
|
|
112
|
+
Options:
|
|
113
|
+
--provider=openai|anthropic (default: auto-detect from env)
|
|
114
|
+
--model=<model-name> (default: gpt-4o-mini / claude-haiku-4-5)
|
|
115
|
+
|
|
116
|
+
Examples:
|
|
117
|
+
ask "What's the correlation between income and education?"
|
|
118
|
+
ask "Which variables have the most missing data?"
|
|
119
|
+
ask "What regression model should I use for this binary outcome?"
|
|
120
|
+
ask "Explain the OLS results" --provider=anthropic
|
|
121
|
+
"""
|
|
122
|
+
import os
|
|
123
|
+
|
|
124
|
+
ca = CommandArgs(args)
|
|
125
|
+
# Question is the rest of args after stripping flags
|
|
126
|
+
question = ca.strip_flags_and_options().strip().strip('"\'')
|
|
127
|
+
if not question:
|
|
128
|
+
return 'Usage: ask "<question>" [--provider=openai|anthropic] [--model=<name>]'
|
|
129
|
+
|
|
130
|
+
provider = ca.options.get("provider", "").lower()
|
|
131
|
+
model = ca.options.get("model", "")
|
|
132
|
+
|
|
133
|
+
context = _build_context(session)
|
|
134
|
+
|
|
135
|
+
# Auto-detect provider
|
|
136
|
+
if not provider:
|
|
137
|
+
if os.getenv("ANTHROPIC_API_KEY"):
|
|
138
|
+
provider = "anthropic"
|
|
139
|
+
elif os.getenv("OPENAI_API_KEY"):
|
|
140
|
+
provider = "openai"
|
|
141
|
+
else:
|
|
142
|
+
return (
|
|
143
|
+
"No AI API key found.\n"
|
|
144
|
+
"Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable.\n"
|
|
145
|
+
"Install: pip install openai OR pip install anthropic"
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
if provider == "openai":
|
|
150
|
+
result = _ask_openai(question, context, model)
|
|
151
|
+
if result is None:
|
|
152
|
+
return "openai package not installed. Run: pip install openai"
|
|
153
|
+
elif provider == "anthropic":
|
|
154
|
+
result = _ask_anthropic(question, context, model)
|
|
155
|
+
if result is None:
|
|
156
|
+
return "anthropic package not installed. Run: pip install anthropic"
|
|
157
|
+
else:
|
|
158
|
+
return f"Unknown provider: {provider}. Use 'openai' or 'anthropic'."
|
|
159
|
+
return result
|
|
160
|
+
except Exception as e:
|
|
161
|
+
return friendly_error(e, "ask")
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Nonparametric test commands: ranksum, signrank, kwallis, spearman."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
from openstat.stats.nonparametric import (
|
|
10
|
+
spearman_corr,
|
|
11
|
+
ranksum_test,
|
|
12
|
+
signrank_test,
|
|
13
|
+
kruskal_wallis_test,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
18
|
+
opts: dict[str, str] = {}
|
|
19
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
20
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
21
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
22
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
23
|
+
return positional, opts
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _fmt(d: dict) -> str:
|
|
27
|
+
lines = [f"\n{d.get('test', 'Result')}", "-" * 55]
|
|
28
|
+
skip = {"test", "groups", "n_per_group", "_model"}
|
|
29
|
+
for k, v in d.items():
|
|
30
|
+
if k in skip:
|
|
31
|
+
continue
|
|
32
|
+
if isinstance(v, float):
|
|
33
|
+
lines.append(f" {k:<30} {v:.6f}")
|
|
34
|
+
elif isinstance(v, list):
|
|
35
|
+
lines.append(f" {k:<30} {v}")
|
|
36
|
+
else:
|
|
37
|
+
lines.append(f" {k:<30} {v}")
|
|
38
|
+
lines.append("-" * 55)
|
|
39
|
+
return "\n".join(lines)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@command("ranksum", usage="ranksum var by(groupvar) [--less|--greater]")
|
|
43
|
+
def cmd_ranksum(session: Session, args: str) -> str:
|
|
44
|
+
"""Wilcoxon rank-sum (Mann-Whitney U) test for two independent groups."""
|
|
45
|
+
df = session.require_data()
|
|
46
|
+
positional, opts = _stata_opts(args)
|
|
47
|
+
if not positional:
|
|
48
|
+
return "Usage: ranksum var by(groupvar)"
|
|
49
|
+
|
|
50
|
+
var = positional[0]
|
|
51
|
+
by = opts.get("by")
|
|
52
|
+
if by is None:
|
|
53
|
+
return "Specify group variable: ranksum var by(groupvar)"
|
|
54
|
+
|
|
55
|
+
alt = "two-sided"
|
|
56
|
+
if "--less" in args:
|
|
57
|
+
alt = "less"
|
|
58
|
+
elif "--greater" in args:
|
|
59
|
+
alt = "greater"
|
|
60
|
+
|
|
61
|
+
try:
|
|
62
|
+
r = ranksum_test(df, var, by, alternative=alt)
|
|
63
|
+
return _fmt(r)
|
|
64
|
+
except Exception as exc:
|
|
65
|
+
return f"ranksum error: {exc}"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@command("signrank", usage="signrank var1 [var2] [mu(0)]")
|
|
69
|
+
def cmd_signrank(session: Session, args: str) -> str:
|
|
70
|
+
"""Wilcoxon signed-rank test (one-sample or paired)."""
|
|
71
|
+
df = session.require_data()
|
|
72
|
+
positional, opts = _stata_opts(args)
|
|
73
|
+
if not positional:
|
|
74
|
+
return "Usage: signrank var1 [var2] [mu(0)]"
|
|
75
|
+
|
|
76
|
+
mu = float(opts.get("mu", 0.0))
|
|
77
|
+
var1 = positional[0]
|
|
78
|
+
var2 = positional[1] if len(positional) > 1 and positional[1] in df.columns else None
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
r = signrank_test(df, var1, var2, mu=mu)
|
|
82
|
+
return _fmt(r)
|
|
83
|
+
except Exception as exc:
|
|
84
|
+
return f"signrank error: {exc}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@command("kwallis", usage="kwallis var by(groupvar)")
|
|
88
|
+
def cmd_kwallis(session: Session, args: str) -> str:
|
|
89
|
+
"""Kruskal-Wallis H test for k independent groups."""
|
|
90
|
+
df = session.require_data()
|
|
91
|
+
positional, opts = _stata_opts(args)
|
|
92
|
+
if not positional:
|
|
93
|
+
return "Usage: kwallis var by(groupvar)"
|
|
94
|
+
|
|
95
|
+
var = positional[0]
|
|
96
|
+
by = opts.get("by")
|
|
97
|
+
if by is None:
|
|
98
|
+
return "Specify group variable: kwallis var by(groupvar)"
|
|
99
|
+
|
|
100
|
+
try:
|
|
101
|
+
r = kruskal_wallis_test(df, var, by)
|
|
102
|
+
lines = [_fmt(r)]
|
|
103
|
+
lines.append("\nGroup counts:")
|
|
104
|
+
for g, n in zip(r["groups"], r["n_per_group"]):
|
|
105
|
+
lines.append(f" {g!s:<20} n = {n}")
|
|
106
|
+
return "\n".join(lines)
|
|
107
|
+
except Exception as exc:
|
|
108
|
+
return f"kwallis error: {exc}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@command("spearman", usage="spearman var1 var2 [var3 ...]")
|
|
112
|
+
def cmd_spearman(session: Session, args: str) -> str:
|
|
113
|
+
"""Spearman rank correlation matrix."""
|
|
114
|
+
df = session.require_data()
|
|
115
|
+
positional, opts = _stata_opts(args)
|
|
116
|
+
cols = [c for c in positional if c in df.columns]
|
|
117
|
+
if len(cols) < 2:
|
|
118
|
+
return "spearman requires at least 2 numeric variables."
|
|
119
|
+
|
|
120
|
+
try:
|
|
121
|
+
r = spearman_corr(df, cols)
|
|
122
|
+
except Exception as exc:
|
|
123
|
+
return f"spearman error: {exc}"
|
|
124
|
+
|
|
125
|
+
rho = r["rho"]
|
|
126
|
+
pvals = r["pvalues"]
|
|
127
|
+
k = len(cols)
|
|
128
|
+
w = max(len(c) for c in cols) + 2
|
|
129
|
+
|
|
130
|
+
lines = ["\nSpearman Rank Correlation", "=" * (w + k * 9 + 2)]
|
|
131
|
+
header = " " * w + "".join(f" {c[:7]:>7}" for c in cols)
|
|
132
|
+
lines.append(header)
|
|
133
|
+
lines.append("-" * (w + k * 9 + 2))
|
|
134
|
+
for i, ci in enumerate(cols):
|
|
135
|
+
row = f"{ci:<{w}}"
|
|
136
|
+
for j in range(k):
|
|
137
|
+
row += f" {rho[i][j]:>7.4f}"
|
|
138
|
+
lines.append(row)
|
|
139
|
+
lines.append("")
|
|
140
|
+
lines.append("P-values:")
|
|
141
|
+
for i, ci in enumerate(cols):
|
|
142
|
+
row = f"{ci:<{w}}"
|
|
143
|
+
for j in range(k):
|
|
144
|
+
if i == j:
|
|
145
|
+
row += f" {' .':>7}"
|
|
146
|
+
else:
|
|
147
|
+
row += f" {pvals[i][j]:>7.4f}"
|
|
148
|
+
lines.append(row)
|
|
149
|
+
return "\n".join(lines)
|