openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""Text analysis: TF-IDF, word frequency, topic modeling (LDA)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from openstat.commands.base import command, CommandArgs, friendly_error
|
|
5
|
+
from openstat.session import Session
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@command("textfreq", usage="textfreq <col> [--top=20] [--stopwords]")
def cmd_textfreq(session: Session, args: str) -> str:
    """Report the most frequent words found in a text column.

    Options:
      --top=<n>       show top N words (default: 20)
      --stopwords     remove common English stopwords
      --min=<n>       minimum word length (default: 2)

    Examples:
      textfreq review_text --top=30 --stopwords
      textfreq title --min=4
    """
    import re
    from collections import Counter
    import polars as pl

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: textfreq <col> [--top=20]"

    col = ca.positional[0]
    top_n = int(ca.options.get("top", 20))
    drop_stopwords = "stopwords" in ca.flags
    min_len = int(ca.options.get("min", 2))

    STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "is", "was", "are", "were", "be", "been",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "this", "that", "these", "those", "it", "its",
        "i", "we", "you", "he", "she", "they", "my", "your", "our", "their",
        "not", "no", "as", "if", "so", "than", "then", "when", "where", "how",
        "what", "which", "who",
    }

    try:
        df = session.require_data()
        if col not in df.columns:
            return f"Column not found: {col}"

        texts = df[col].drop_nulls().cast(pl.Utf8).to_list()

        # Tokenize each document (ASCII letters only, lowercased) and count
        # words as we go, instead of materializing one big word list.
        counter: Counter = Counter()
        total = 0
        for doc in texts:
            for word in re.findall(r"[a-zA-Z]+", doc.lower()):
                if min_len > 1 and len(word) < min_len:
                    continue
                if drop_stopwords and word in STOPWORDS:
                    continue
                counter[word] += 1
                total += 1

        if not total:
            return "No words found after filtering."

        out = [f"Word Frequency — {col} (docs={len(texts)}, unique_words={len(counter)})", ""]
        out.append(f" {'Rank':<6} {'Word':<25} {'Count':>8} {'%':>7}")
        out.append(" " + "-" * 50)
        for rank, (word, cnt) in enumerate(counter.most_common(top_n), 1):
            out.append(f" {rank:<6} {word:<25} {cnt:>8,} {100*cnt/total:>6.2f}%")
        return "\n".join(out)
    except Exception as e:
        return friendly_error(e, "textfreq")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@command("tfidf", usage="tfidf <col> [--top=20] [--max_features=1000]")
def cmd_tfidf(session: Session, args: str) -> str:
    """Rank the most distinctive terms of a text column by mean TF-IDF.

    Options:
      --top=<n>           top N terms by mean TF-IDF score (default: 20)
      --max_features=<n>  vocabulary size limit (default: 1000)
      --ngram_min=<n>     minimum n-gram size (default: 1)
      --ngram_max=<n>     maximum n-gram size (default: 1)

    Examples:
      tfidf review_text --top=30
      tfidf comments --ngram_min=2 --ngram_max=2 --top=15
    """
    # sklearn is an optional dependency; give install instructions instead of
    # a traceback when it is absent.
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    import numpy as np
    import polars as pl

    parsed = CommandArgs(args)
    if not parsed.positional:
        return "Usage: tfidf <col> [--top=20]"

    col = parsed.positional[0]
    opt = parsed.options.get
    top_n = int(opt("top", 20))
    vocab_limit = int(opt("max_features", 1000))
    ngrams = (int(opt("ngram_min", 1)), int(opt("ngram_max", 1)))

    try:
        df = session.require_data()
        if col not in df.columns:
            return f"Column not found: {col}"

        docs = df[col].drop_nulls().cast(pl.Utf8).to_list()
        if len(docs) < 2:
            return "Need at least 2 documents for TF-IDF."

        vectorizer = TfidfVectorizer(
            max_features=vocab_limit, stop_words="english", ngram_range=ngrams
        )
        matrix = vectorizer.fit_transform(docs)
        terms = vectorizer.get_feature_names_out()
        # Column means of the sparse matrix, flattened to a 1-D score vector.
        means = np.asarray(matrix.mean(axis=0)).flatten()
        ranked = means.argsort()[::-1][:top_n]

        out = [f"TF-IDF Analysis — {col} (docs={len(docs)}, vocab={len(terms)})", ""]
        out.append(f" {'Rank':<6} {'Term':<30} {'Mean TF-IDF':>12}")
        out.append(" " + "-" * 52)
        for rank, idx in enumerate(ranked, 1):
            out.append(f" {rank:<6} {terms[idx]:<30} {means[idx]:>12.5f}")
        return "\n".join(out)
    except Exception as e:
        return friendly_error(e, "tfidf")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@command("lda", usage="lda <col> [--topics=5] [--words=10] [--iter=10]")
def cmd_lda(session: Session, args: str) -> str:
    """Latent Dirichlet Allocation (LDA) topic modeling.

    Fits an LDA model on a bag-of-words representation of the column and
    prints the top words for each discovered topic.

    Options:
      --topics=<n>        number of topics (default: 5)
      --words=<n>         top words per topic to show (default: 10)
      --iter=<n>          max iterations (default: 10)
      --max_features=<n>  vocabulary size (default: 1000)

    Examples:
      lda review_text --topics=5
      lda abstract --topics=8 --words=12
    """
    # sklearn is an optional dependency; fail with install instructions.
    try:
        from sklearn.decomposition import LatentDirichletAllocation
        from sklearn.feature_extraction.text import CountVectorizer
    except ImportError:
        return "scikit-learn required. Install: pip install scikit-learn"

    # NOTE: the numpy import the original carried here was unused and has
    # been removed; polars is needed for the Utf8 cast below.
    import polars as pl

    ca = CommandArgs(args)
    if not ca.positional:
        return "Usage: lda <col> [--topics=5]"

    col = ca.positional[0]
    n_topics = int(ca.options.get("topics", 5))
    n_words = int(ca.options.get("words", 10))
    n_iter = int(ca.options.get("iter", 10))
    max_features = int(ca.options.get("max_features", 1000))

    try:
        df = session.require_data()
        if col not in df.columns:
            return f"Column not found: {col}"

        texts = df[col].drop_nulls().cast(pl.Utf8).to_list()
        if len(texts) < n_topics:
            return f"Need at least {n_topics} documents (got {len(texts)})."

        # min_df=2 drops terms that appear in only a single document.
        vec = CountVectorizer(max_features=max_features, stop_words="english", min_df=2)
        dtm = vec.fit_transform(texts)
        feature_names = vec.get_feature_names_out()

        # Fixed random_state keeps topic assignments reproducible across runs.
        lda_model = LatentDirichletAllocation(
            n_components=n_topics, max_iter=n_iter, random_state=42
        )
        lda_model.fit(dtm)

        lines = [f"LDA Topic Modeling — {col} (docs={len(texts)}, topics={n_topics})", ""]
        for topic_idx, topic in enumerate(lda_model.components_):
            # Highest-weighted vocabulary indices for this topic.
            top_words_idx = topic.argsort()[::-1][:n_words]
            top_words = [feature_names[i] for i in top_words_idx]
            lines.append(f" Topic {topic_idx + 1}: {', '.join(top_words)}")
        return "\n".join(lines)
    except Exception as e:
        return friendly_error(e, "lda")
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Advanced time-series commands: granger, johansen, vecm, stl, tssmooth."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from openstat.commands.base import command
|
|
8
|
+
from openstat.session import Session
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _stata_opts(raw: str) -> tuple[list[str], dict[str, str]]:
|
|
12
|
+
opts: dict[str, str] = {}
|
|
13
|
+
for m in re.finditer(r'(\w+)\(([^)]*)\)', raw):
|
|
14
|
+
opts[m.group(1).lower()] = m.group(2)
|
|
15
|
+
rest = re.sub(r'\w+\([^)]*\)', '', raw)
|
|
16
|
+
positional = [t.strip(',') for t in rest.split() if t.strip(',')]
|
|
17
|
+
return positional, opts
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@command("granger", usage="granger dep cause [maxlag(4)]")
def cmd_granger(session: Session, args: str) -> str:
    """Test whether `cause` Granger-causes `dep` up to maxlag lags."""
    from openstat.stats.ts_advanced import granger_causality
    df = session.require_data()
    names, opts = _stata_opts(args)
    if len(names) < 2:
        return "Usage: granger dep cause [maxlag(4)]"
    dep, cause = names[:2]
    maxlag = int(opts.get("maxlag", 4))
    try:
        res = granger_causality(df, dep, cause, maxlag=maxlag)
        out = [f"\nGranger Causality: {cause} → {dep}", "-" * 50]
        # One F-test p-value per candidate lag length.
        for lag, pval in res["lag_pvalues"].items():
            out.append(f" Lag {lag:2d}: F-test p-value = {pval:.4f}")
        out.append(f"\n Min p-value: {res['min_pvalue']:.4f} at lag {res['best_lag']}")
        out.append(f" Granger-causes at 5%: {'YES' if res['reject_null_5pct'] else 'NO'}")
        return "\n".join(out)
    except Exception as err:
        return f"granger error: {err}"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@command("johansen", usage="johansen var1 var2 [var3 ...] [lags(1)]")
def cmd_johansen(session: Session, args: str) -> str:
    """Johansen test for cointegration among two or more series."""
    from openstat.stats.ts_advanced import johansen_test
    df = session.require_data()
    names, opts = _stata_opts(args)
    # Keep only tokens that actually name columns of the current dataset.
    cols = [c for c in names if c in df.columns]
    if len(cols) < 2:
        return "johansen requires at least 2 variables."
    k_ar_diff = int(opts.get("lags", 1))
    try:
        res = johansen_test(df, cols, k_ar_diff=k_ar_diff)
        out = ["\nJohansen Cointegration Test", "=" * 55]
        out.append(f" Variables: {', '.join(cols)}")
        out.append(f" Cointegrating vectors: {res['n_cointegrating_vectors']}")
        out.append("\n Trace Statistics:")
        out.append(f" {'r=0':>10} {'Statistic':>12} {'CV 95%':>10} {'CV 90%':>10}")
        triples = zip(res["trace_statistics"], res["trace_cv_95"], res["trace_cv_90"])
        for i, (trace, cv95, cv90) in enumerate(triples):
            out.append(f" r<={i:<8} {trace:>12.4f} {cv95:>10.4f} {cv90:>10.4f}")
        return "\n".join(out)
    except Exception as err:
        return f"johansen error: {err}"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@command("vecm", usage="vecm var1 var2 [var3 ...] [lags(1) rank(1)]")
def cmd_vecm(session: Session, args: str) -> str:
    """Fit a Vector Error Correction Model on cointegrated series."""
    from openstat.stats.ts_advanced import fit_vecm
    df = session.require_data()
    names, opts = _stata_opts(args)
    # Keep only tokens that actually name columns of the current dataset.
    cols = [c for c in names if c in df.columns]
    if len(cols) < 2:
        return "vecm requires at least 2 variables."
    k_ar_diff = int(opts.get("lags", 1))
    coint_rank = int(opts.get("rank", 1))
    try:
        res = fit_vecm(df, cols, k_ar_diff=k_ar_diff, coint_rank=coint_rank)
        # Remember the fit so follow-up commands can reuse it.
        session._last_model = res
        out = ["\nVECM Results", "=" * 50]
        out.append(f" Variables: {', '.join(cols)}")
        out.append(f" Cointegration rank: {coint_rank}, AR lags: {k_ar_diff}")
        out.append("\n Alpha (adjustment coefficients):")
        for coef_row in res["alpha"]:
            out.append(f" {coef_row}")
        return "\n".join(out)
    except Exception as err:
        return f"vecm error: {err}"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@command("stl", usage="stl varname [period(12)]")
def cmd_stl(session: Session, args: str) -> str:
    """Decompose a series into trend + seasonal + residual via STL."""
    from openstat.stats.ts_advanced import stl_decompose
    df = session.require_data()
    names, opts = _stata_opts(args)
    if not names:
        return "Usage: stl varname [period(12)]"
    col = names[0]
    if col not in df.columns:
        return f"Column '{col}' not found."
    period = int(opts.get("period", 12))
    try:
        res = stl_decompose(df, col, period=period)
        out = [f"\nSTL Decomposition: {col} (period={period})", "-" * 45]
        out.append(f" {'Strength of trend':<30} {res['strength_trend']:.4f}")
        out.append(f" {'Strength of seasonal':<30} {res['strength_seasonal']:.4f}")
        out.append(f" Trend range: [{min(res['trend']):.4f}, {max(res['trend']):.4f}]")
        out.append(f" Seasonal range: [{min(res['seasonal']):.4f}, {max(res['seasonal']):.4f}]")
        # Stash the decomposition so plotting/export commands can reuse it.
        session._last_model = res
        return "\n".join(out)
    except Exception as err:
        return f"stl error: {err}"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@command("tssmooth", usage="tssmooth varname [method(ma|exp) window(3) alpha(0.3)]")
def cmd_tssmooth(session: Session, args: str) -> str:
    """Add a smoothed version of a time-series column to the dataset."""
    from openstat.stats.ts_advanced import tssmooth
    df = session.require_data()
    names, opts = _stata_opts(args)
    if not names:
        return "Usage: tssmooth varname [method(ma|exp) window(3) alpha(0.3)]"
    col = names[0]
    if col not in df.columns:
        return f"Column '{col}' not found."

    method = opts.get("method", "ma")
    window = int(opts.get("window", 3))
    alpha = float(opts.get("alpha", 0.3))

    # Snapshot first so the data mutation below can be undone.
    session.snapshot()
    try:
        session.df = tssmooth(df, col, method=method, window=window, alpha=alpha)
        new_col = f"{col}_smooth"
        return f"Smoothed '{col}' → '{new_col}' using {method} (window={window}, alpha={alpha})"
    except Exception as err:
        return f"tssmooth error: {err}"
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Time series commands: tsset, arima, var, dfuller, forecast, irf."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import polars as pl
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from openstat.session import Session, ModelResult
|
|
13
|
+
from openstat.commands.base import command, CommandArgs, rich_to_str, friendly_error
|
|
14
|
+
from openstat.dsl.parser import parse_formula, ParseError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@command("tsset", usage="tsset <time_var> [freq=M|Q|Y|D]")
def cmd_tsset(session: Session, args: str) -> str:
    """Declare which column indexes the data as a time series."""
    df = session.require_data()
    parsed = CommandArgs(args)
    if not parsed.positional:
        return "Usage: tsset <time_var> [freq=M|Q|Y|D]"

    tvar = parsed.positional[0]
    if tvar not in df.columns:
        return f"Column not found: {tvar}"

    # Record the declaration on the session for later ts commands.
    session._time_var = tvar
    session._ts_freq = parsed.get_option("freq")

    uniq = df[tvar].n_unique()
    msg = f"Time variable: {tvar} ({uniq} unique values)"
    if session._ts_freq:
        msg += f"\nFrequency: {session._ts_freq}"
    return msg
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@command("dfuller", usage="dfuller <variable>")
def cmd_dfuller(session: Session, args: str) -> str:
    """Run the augmented Dickey-Fuller unit-root test on one column."""
    df = session.require_data()
    name = args.strip()
    if not name:
        return "Usage: dfuller <variable>"
    if name not in df.columns:
        return f"Column not found: {name}"

    from openstat.stats.timeseries import adf_test
    values = df[name].drop_nulls().to_numpy()
    return adf_test(values, name)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@command("arima", usage="arima y [~ x1], order(p,d,q)")
def cmd_arima(session: Session, args: str) -> str:
    """Fit ARIMA(p,d,q) or ARIMAX model.

    The model order is given Stata-style as ``order(p,d,q)``; whatever
    remains after removing it is the formula — either a bare dependent
    variable (``y``) or ``y ~ x1 x2`` for exogenous regressors.

    Side effects: stores the fitted model and metadata on the session so
    that ``forecast`` and result-export commands can reuse it.
    """
    df = session.require_data()

    # Parse order(p,d,q)
    m = re.search(r'order\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)', args)
    if not m:
        return "Usage: arima y [~ x1], order(p,d,q)"
    order = (int(m.group(1)), int(m.group(2)), int(m.group(3)))

    # Remove order(...) from args for formula parsing.  Strip commas on
    # BOTH sides (the original only rstrip'ed): "order(1,0,1), y" would
    # otherwise leave a leading comma that corrupts the parsed dep name.
    formula_str = args[:m.start()] + args[m.end():]
    formula_str = formula_str.strip().strip(",").strip()

    # Parse formula
    if "~" in formula_str:
        try:
            dep, exog_vars = parse_formula(formula_str)
        except ParseError as e:
            return f"Formula error: {e}"
    else:
        parts = formula_str.split()
        dep = parts[0] if parts else ""
        exog_vars = None

    if not dep or dep not in df.columns:
        return f"Dependent variable not found: {dep}"

    try:
        from openstat.stats.timeseries import fit_arima

        result, raw = fit_arima(df, dep, order, exog_vars, session._time_var)

        # Remember the fit so follow-up commands (forecast, esttab, ...) work.
        session._last_model = raw
        session._last_model_vars = (dep, exog_vars or [])
        session._last_fit_result = result
        session._last_fit_kwargs = {"order": order}

        md = result.to_markdown() if hasattr(result, "to_markdown") else ""
        session.results.append(ModelResult(
            name=f"ARIMA{order}", formula=result.formula,
            table=md, details={
                "n_obs": result.n_obs,
                "params": dict(result.params),
                "aic": result.aic,
                "bic": result.bic,
            },
        ))

        output = result.summary_table()
        if result.warnings:
            output += "\n" + "\n".join(result.warnings)
        return output
    except Exception as e:
        return friendly_error(e, "arima")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@command("var", usage="var y1 y2 [y3], lags(n)")
def cmd_var(session: Session, args: str) -> str:
    """Fit a Vector Autoregression (VAR) model."""
    df = session.require_data()

    # Stata-style lags(n) option is mandatory.
    lag_match = re.search(r'lags\((\d+)\)', args)
    if not lag_match:
        return "Usage: var y1 y2, lags(n)"
    lags = int(lag_match.group(1))

    # Everything before lags(...) is the whitespace-separated variable list.
    variables = args[:lag_match.start()].strip().rstrip(",").strip().split()
    if len(variables) < 2:
        return "VAR requires at least 2 variables."

    missing = [v for v in variables if v not in df.columns]
    if missing:
        return f"Columns not found: {', '.join(missing)}"

    try:
        from openstat.stats.timeseries import fit_var

        summary, raw = fit_var(df, variables, lags, session._time_var)

        # Remember the fit so forecast/irf can reuse it.
        session._last_model = raw
        session._last_model_vars = (variables[0], variables[1:])
        session._last_fit_kwargs = {"lags": lags, "variables": variables}

        session.results.append(ModelResult(
            name=f"VAR({lags})", formula=f"VAR({', '.join(variables)})",
            table=summary, details={"lags": lags, "variables": variables},
        ))

        return summary
    except Exception as e:
        return friendly_error(e, "var")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@command("forecast", usage="forecast <steps>")
def cmd_forecast(session: Session, args: str) -> str:
    """Forecast future values from the last fitted time-series model."""
    if session._last_model is None:
        return "No model fitted. Run arima or var first."

    arg = args.strip()
    steps = int(arg) if arg.isdigit() else 12  # default horizon: 12 steps

    try:
        from openstat.stats.timeseries import forecast_model
        fc = forecast_model(session._last_model, steps)

        def render(console: Console) -> None:
            table = Table(title=f"Forecast ({steps} steps)")
            table.add_column("Step", justify="right")
            if fc.ndim == 1:
                # Univariate (ARIMA): one forecast column.
                table.add_column("Forecast", justify="right")
                rows = ([f"{val:.4f}"] for val in fc)
            else:
                # Multivariate (VAR): one column per series.
                for name in session._last_fit_kwargs.get("variables", []):
                    table.add_column(name, justify="right")
                rows = ([f"{v:.4f}" for v in row] for row in fc)
            for step, cells in enumerate(rows, 1):
                table.add_row(str(step), *cells)
            console.print(table)

        return rich_to_str(render)
    except Exception as e:
        return friendly_error(e, "forecast")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@command("irf", usage="irf [steps=N]")
def cmd_irf(session: Session, args: str) -> str:
    """Compute impulse response functions for the last fitted VAR model."""
    if session._last_model is None:
        return "No model fitted. Run var first."

    horizon = int(CommandArgs(args).get_option("steps", "10"))

    try:
        from openstat.stats.timeseries import compute_irf
        return compute_irf(session._last_model, horizon)
    except Exception as e:
        return friendly_error(e, "irf")
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""TUI dashboard command: dashboard (requires textual)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from openstat.commands.base import command
|
|
6
|
+
from openstat.session import Session
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@command("dashboard", usage="dashboard")
def cmd_dashboard(session: Session, args: str) -> str:
    """Launch an interactive TUI dashboard (requires: pip install textual).

    Shows dataset overview, variable list, model results, and recent history
    in a full-screen terminal UI. Press Q or Ctrl+C to exit.

    The UI renders a snapshot of the session taken at launch; commands run
    while the dashboard is open are not reflected until it is reopened.
    Blocks until the user quits, then returns a confirmation string.
    """
    # textual is an optional dependency — return install instructions
    # instead of raising when it is missing.
    try:
        from textual.app import App, ComposeResult
        from textual.widgets import (
            DataTable, Footer, Header, Label, RichLog, TabbedContent, TabPane,
        )
        from textual.binding import Binding
    except ImportError:
        return (
            "textual is required for the dashboard.\n"
            "Install: pip install textual"
        )

    import polars as pl

    # ── Snapshot data so the TUI doesn't need the live session ──────────────
    dataset_name = session.dataset_name or "(no dataset)"
    shape_str = session.shape_str
    df = session.df
    results = list(session.results)
    history = list(session.history[-50:])  # last 50 commands

    # Build column summary: one (name, dtype, missing, mean, sd) row per
    # column.  Non-numeric columns and empty series get "—" placeholders.
    col_rows: list[tuple[str, ...]] = []
    if df is not None:
        NUMERIC = (pl.Float32, pl.Float64, pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                   pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64)
        for c in df.columns:
            dtype = str(df[c].dtype)
            n_miss = str(df[c].null_count())
            if df[c].dtype in NUMERIC:
                col_data = df[c].drop_nulls()
                if col_data.len() > 0:
                    mean_str = f"{col_data.mean():.3f}"
                    # SD is undefined for a single observation.
                    sd_str = f"{col_data.std():.3f}" if col_data.len() > 1 else "—"
                else:
                    mean_str = sd_str = "—"
            else:
                mean_str = sd_str = "—"
            col_rows.append((c, dtype, n_miss, mean_str, sd_str))

    # The App subclass closes over the snapshot locals built above.
    class OpenStatDashboard(App):
        TITLE = f"OpenStat Dashboard — {dataset_name}"
        BINDINGS = [
            Binding("q", "quit", "Quit"),
            Binding("ctrl+c", "quit", "Quit"),
        ]
        CSS = """
        Screen { background: #1a1a2e; }
        Header { background: #16213e; color: #e94560; }
        Footer { background: #16213e; color: #a8b2d8; }
        TabbedContent { background: #1a1a2e; }
        TabPane { padding: 1 2; }
        DataTable { height: auto; }
        Label { color: #cdd6f4; margin: 0 0 1 0; }
        RichLog { height: 30; border: solid #313244; background: #181825; }
        """

        def compose(self) -> ComposeResult:
            # Lay out the four tabs: Overview, Variables, Models, History.
            yield Header()
            with TabbedContent():
                with TabPane("Overview", id="overview"):
                    yield Label(f"[bold]Dataset:[/bold] {dataset_name} | [bold]Shape:[/bold] {shape_str}")
                    yield Label(
                        f"[bold]Models fitted:[/bold] {len(results)} | "
                        f"[bold]Commands run:[/bold] {len(session.history)}"
                    )

                with TabPane("Variables", id="variables"):
                    tbl = DataTable(zebra_stripes=True)
                    tbl.add_columns("Variable", "Type", "Missing", "Mean", "SD")
                    for row in col_rows:
                        tbl.add_row(*row)
                    yield tbl

                with TabPane("Models", id="models"):
                    if results:
                        log = RichLog(markup=True, highlight=True)
                        for mr in results:
                            log.write(f"[bold cyan]{mr.name}: {mr.formula}[/bold cyan]")
                            log.write(mr.table)
                            log.write("")
                        yield log
                    else:
                        yield Label("No models fitted yet.")

                with TabPane("History", id="history"):
                    # markup=False so raw command text is never parsed as
                    # Rich markup.
                    log = RichLog(markup=False, highlight=False)
                    for cmd_line in history:
                        log.write(f". {cmd_line}")
                    yield log

            yield Footer()

    # run() blocks until the user quits the TUI.
    app = OpenStatDashboard()
    app.run()
    return "Dashboard closed."
|