statchat-app 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- statchat/__init__.py +0 -0
- statchat/__main__.py +172 -0
- statchat/assets/icon.ico +0 -0
- statchat/assets/icon.png +0 -0
- statchat/cli/__init__.py +0 -0
- statchat/cli/runner.py +197 -0
- statchat/core/__init__.py +0 -0
- statchat/core/adjuster.py +265 -0
- statchat/core/analyzer.py +181 -0
- statchat/core/cleaner.py +142 -0
- statchat/core/llm_backend.py +257 -0
- statchat/core/loader.py +59 -0
- statchat/core/reporter.py +605 -0
- statchat/gui/__init__.py +0 -0
- statchat/gui/app.py +678 -0
- statchat/gui/chat_panel.py +559 -0
- statchat/gui/settings_dialog.py +269 -0
- statchat/icon.ico +0 -0
- statchat_app-1.0.0.dist-info/METADATA +175 -0
- statchat_app-1.0.0.dist-info/RECORD +23 -0
- statchat_app-1.0.0.dist-info/WHEEL +5 -0
- statchat_app-1.0.0.dist-info/entry_points.txt +2 -0
- statchat_app-1.0.0.dist-info/top_level.txt +1 -0
statchat/__init__.py
ADDED
|
File without changes
|
statchat/__main__.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Stat Chat — Data Cleaning, Normalization, Statistical Analysis & Iterative Adjustment
|
|
4
|
+
|
|
5
|
+
Entry points:
|
|
6
|
+
GUI: statchat
|
|
7
|
+
CLI: statchat --cli --input data.csv [options]
|
|
8
|
+
Also: python -m statchat
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ── Dependency check ───────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
# Third-party packages needed at runtime, keyed by importable module name and
# mapped to the pip distribution that provides it. The two names differ for
# scikit-learn (imported as ``sklearn``) and Pillow (imported as ``PIL``).
REQUIRED = dict(
    pandas="pandas",
    numpy="numpy",
    scipy="scipy",
    sklearn="scikit-learn",
    reportlab="reportlab",
    matplotlib="matplotlib",
    openpyxl="openpyxl",
    requests="requests",
    PIL="Pillow",
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def check_dependencies():
    """Return the pip names of all REQUIRED packages that cannot be imported.

    Each module listed in ``REQUIRED`` is imported in turn; the result keeps
    the order of ``REQUIRED`` and is empty when everything is available.
    """
    def _importable(module_name):
        # Only ImportError counts as "missing"; any other failure propagates.
        try:
            __import__(module_name)
        except ImportError:
            return False
        return True

    return [
        pip_name
        for module_name, pip_name in REQUIRED.items()
        if not _importable(module_name)
    ]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def prompt_install(missing):
    """Report missing packages, offer to install them, then exit.

    The list of packages and the equivalent ``pip install`` command are always
    printed to stdout. If a Tk dialog can be shown, the user is asked whether
    to install immediately; on "yes", pip is run with the current interpreter.

    Args:
        missing: pip distribution names that could not be imported.

    Raises:
        SystemExit: always, with status 1. Even after a successful install the
            application has to be relaunched.
    """
    cmd = "pip install " + " ".join(missing)
    rule = "=" * 60
    print(rule)
    print(" Stat Chat — Missing Dependencies")
    print(rule)
    print("\n The following packages are required but not installed:\n")
    for p in missing:
        print(f" • {p}")
    print(f"\n Install them by running:\n\n {cmd}\n")

    # The dialog is best-effort: tkinter may be absent or there may be no
    # display. The console instructions above remain the fallback, but the
    # reason is reported instead of being silently discarded.
    answer = False
    try:
        import tkinter as tk
        from tkinter import messagebox

        root = tk.Tk()
        try:
            root.withdraw()
            answer = messagebox.askyesno(
                "Stat Chat — Missing packages",
                "The following required packages are missing:\n\n"
                + "\n".join(f" • {p}" for p in missing)
                + f"\n\nInstall them now?\n\n{cmd}",
            )
        finally:
            # Release the hidden root window even if the dialog raised.
            root.destroy()
    except Exception as exc:
        print(f" (Install prompt unavailable: {exc})", file=sys.stderr)

    if answer:
        import subprocess

        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
        except (subprocess.CalledProcessError, OSError) as exc:
            # Previously a failed install was swallowed without any message.
            print(f"\n ✗ Installation failed: {exc}", file=sys.stderr)
            print(f" Run it manually: {cmd}", file=sys.stderr)
        else:
            print("\n ✓ Installation complete. Relaunch Stat Chat.")

    sys.exit(1)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ── Launchers ──────────────────────────────────────────────────────────────────
|
|
73
|
+
|
|
74
|
+
def run_gui():
    """Start the Tkinter GUI and block in its main loop.

    Exits with status 1 (after printing install hints) when tkinter cannot be
    imported.
    """
    try:
        import tkinter as tk
    except ImportError:
        hints = (
            "[ERROR] Tkinter is not available.",
            " Linux: sudo apt install python3-tk",
            " macOS/Windows: reinstall Python from python.org",
        )
        for hint in hints:
            print(hint)
        sys.exit(1)

    # Imported here so that CLI-only runs never load the GUI package.
    from statchat.gui.app import StatChatApp

    window = tk.Tk()
    StatChatApp(window)
    window.mainloop()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def run_cli(args):
    """Run the command-line pipeline for the parsed arguments *args*."""
    # Deferred import: the CLI runner is only loaded when CLI mode is used.
    from statchat.cli.runner import CLIRunner

    runner = CLIRunner(args)
    runner.run()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# ── CLI argument parser ────────────────────────────────────────────────────────
|
|
95
|
+
|
|
96
|
+
def build_parser():
    """Build and return the ``statchat`` command-line argument parser.

    Options are registered in the same order as they appear in ``--help``:
    general I/O, cleaning, normalization, analysis metrics, adjustments and
    LLM backend settings.
    """
    parser = argparse.ArgumentParser(
        prog="statchat",
        description="Stat Chat — Data Cleaning, Normalization & Statistical Analysis",
        formatter_class=argparse.RawTextHelpFormatter,
    )

    def add_switches(target, switches):
        # Register a run of boolean ``store_true`` flags on a parser or group.
        for option, text in switches:
            target.add_argument(option, action="store_true", help=text)

    # ── General I/O ──
    add_switches(parser, [("--cli", "Run in CLI mode (no GUI)")])
    for long_opt, short_opt, text in (
        ("--input", "-i", "Input CSV or Excel file"),
        ("--output", "-o", "Path to save processed data"),
    ):
        parser.add_argument(long_opt, short_opt, type=str, help=text)
    parser.add_argument(
        "--output-format",
        choices=["csv", "xlsx", "json"],
        default="csv",
        help="Output format (default: csv)",
    )
    parser.add_argument("--report", "-r", type=str, help="Path to save PDF report")

    # ── Cleaning ──
    cleaning = parser.add_argument_group("Cleaning Options")
    add_switches(cleaning, [
        ("--drop-duplicates", "Remove duplicate rows"),
        ("--drop-nulls", "Drop rows with null values"),
    ])
    cleaning.add_argument(
        "--fill-nulls",
        choices=["mean", "median", "mode", "zero"],
        help="Fill null values with strategy",
    )

    # ── Normalization ──
    normalization = parser.add_argument_group("Normalization Options")
    normalization.add_argument(
        "--normalize",
        choices=["zscore", "minmax", "robust"],
        help="Normalization method",
    )
    for option, default, text in (
        ("--norm-mean", 0.0, "Z-score target mean (default: 0.0)"),
        ("--norm-std", 1.0, "Z-score target std (default: 1.0)"),
    ):
        normalization.add_argument(option, type=float, default=default, help=text)

    # ── Analysis metrics ──
    metrics = parser.add_argument_group("Analysis Metrics")
    add_switches(metrics, [
        ("--central-tendency", "Mean, median, mode"),
        ("--dispersion", "Std dev, variance, IQR, range"),
        ("--shape", "Skewness & kurtosis"),
        ("--correlation", "Correlation matrix"),
        ("--percentiles", "P5–P95 percentiles"),
    ])
    metrics.add_argument(
        "--roc-auc",
        type=str,
        metavar="TARGET_COL",
        help="ROC-AUC vs a binary target column",
    )
    add_switches(metrics, [("--all-metrics", "Run all metrics")])

    # ── Adjustments ──
    adjustments = parser.add_argument_group("Adjustment Options (requires LLM backend)")
    adjustments.add_argument(
        "--adjust",
        action="append",
        metavar="INSTRUCTION",
        help="Natural-language adjustment (repeatable).\ne.g. --adjust 'Add 1000 to spend'",
    )
    adjustments.add_argument(
        "--adjust-image",
        type=str,
        metavar="IMAGE_PATH",
        help="Annotated report image — vision model extracts instructions",
    )

    # ── LLM backend ──
    backend = parser.add_argument_group("LLM Backend Options")
    backend.add_argument(
        "--backend",
        choices=["claude", "lmstudio", "lmstudio_vision"],
        default="claude",
        help="LLM provider (default: claude)",
    )
    for option, default, metavar, text in (
        ("--lmstudio-url", "http://localhost:1234", "URL", "LM Studio server URL"),
        ("--lmstudio-model", "", "MODEL_ID", "LM Studio text model ID"),
        ("--lmstudio-vision-model", "", "MODEL_ID", "LM Studio vision model ID"),
    ):
        backend.add_argument(option, type=str, default=default, metavar=metavar, help=text)

    return parser
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ── Main entry point ───────────────────────────────────────────────────────────
|
|
154
|
+
|
|
155
|
+
def main():
    """Program entry point: check dependencies, parse argv, launch CLI or GUI."""
    absent = check_dependencies()
    if absent:
        # prompt_install terminates the process, so nothing below runs
        # while packages are missing.
        prompt_install(absent)

    parser = build_parser()
    args = parser.parse_args()

    if not args.cli:
        run_gui()
        return

    if not args.input:
        parser.error("--input is required in CLI mode")
    run_cli(args)


if __name__ == "__main__":
    main()
|
statchat/assets/icon.ico
ADDED
|
Binary file
|
statchat/assets/icon.png
ADDED
|
Binary file
|
statchat/cli/__init__.py
ADDED
|
File without changes
|
statchat/cli/runner.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""CLI mode runner."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import base64
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from statchat.core.loader import load_file, save_file, get_file_info
|
|
7
|
+
from statchat.core.cleaner import apply_cleaning, apply_normalization
|
|
8
|
+
from statchat.core.analyzer import run_analysis
|
|
9
|
+
from statchat.core.reporter import generate_report
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CLIRunner:
    """Drive one non-interactive Stat Chat run from parsed command-line arguments.

    ``run`` loads the input file, applies cleaning and normalization, applies
    any LLM-driven adjustments, optionally saves the processed data, prints
    the analysis results and optionally writes a PDF report.
    """

    def __init__(self, args):
        """Keep the parsed argument namespace for use by ``run``."""
        self.args = args

    def run(self):
        """Execute the whole pipeline, printing progress to stdout.

        The process exits with status 1 when the input cannot be loaded, an
        adjustment fails, the annotated image does not exist, or the processed
        data cannot be saved. A failed PDF report is printed but does not
        abort the run.
        """
        args = self.args
        print("\n── Stat Chat CLI ──────────────────────────────────")
        print(f"Loading: {args.input}")

        try:
            original_df = load_file(args.input)
        except Exception as e:
            print(f"[ERROR] {e}")
            sys.exit(1)

        info = get_file_info(original_df)
        print(f" Rows: {info['rows']} | Columns: {info['columns']}")
        print(f" Numeric cols: {info['numeric_columns']}")

        # The LLM backend is only configured when an adjustment was requested.
        if args.adjust or args.adjust_image:
            self._configure_backend()

        # ── Cleaning ──
        clean_opts = {
            "drop_duplicates": args.drop_duplicates,
            "drop_nulls": args.drop_nulls,
            "fill_nulls": args.fill_nulls,
        }
        cleaned_df, clean_log = apply_cleaning(original_df.copy(), clean_opts)
        for msg in clean_log:
            print(f" [clean] {msg}")

        # ── Normalization ──
        norm_opts = {
            "normalize": args.normalize,
            "norm_mean": args.norm_mean,
            "norm_std": args.norm_std,
        }
        cleaned_df, norm_log = apply_normalization(cleaned_df, norm_opts)
        for msg in norm_log:
            print(f" [norm] {msg}")

        # ── Adjustments (text first, then the annotated image) ──
        adj_history = []
        working_df = cleaned_df.copy()
        if args.adjust:
            working_df = self._apply_text_adjustments(working_df, adj_history)
        if args.adjust_image:
            working_df = self._apply_image_adjustment(working_df, adj_history)

        # Post-adjustment data, or just the cleaned data if nothing was adjusted.
        final_df = working_df

        # ── Save output data ──
        if args.output:
            try:
                out_path = save_file(final_df, args.output, fmt=args.output_format)
            except Exception as e:
                # Report save failures like every other step instead of
                # letting a raw traceback escape.
                print(f" [ERROR] Could not save data: {e}")
                sys.exit(1)
            print(f"\n [saved] Data → {out_path}")

        # ── Analysis ──
        df_for_analysis, warning = self._restore_target_column(
            final_df.copy().reset_index(drop=True), original_df, args.roc_auc
        )
        if warning:
            print(f" [warn] {warning}")

        analysis_opts = {
            "central_tendency": args.central_tendency,
            "dispersion": args.dispersion,
            "shape": args.shape,
            "correlation": args.correlation,
            "percentiles": args.percentiles,
            "normality": False,
            "roc_auc": args.roc_auc,
            "all_metrics": args.all_metrics,
            "original_df": original_df,
        }
        results = run_analysis(df_for_analysis, analysis_opts)
        if results:
            self._print_results(results)

        # ── PDF report ──
        if args.report:
            self._write_report(original_df, final_df, results,
                               clean_log, norm_log, adj_history)

        print("\n── Done ───────────────────────────────────────────\n")

    # ── Helpers ──────────────────────────────────────────────────────────────

    def _configure_backend(self):
        """Build an LLMConfig from the backend CLI options and pass it to set_config."""
        from statchat.core.llm_backend import LLMConfig, set_config

        args = self.args
        provider = getattr(args, "backend", "claude")
        cfg = LLMConfig(
            provider=provider,
            lmstudio_base_url=getattr(args, "lmstudio_url", "http://localhost:1234"),
            lmstudio_model=getattr(args, "lmstudio_model", ""),
            lmstudio_vision_model=getattr(args, "lmstudio_vision_model", ""),
        )
        set_config(cfg)
        print(f" [backend] {provider}"
              + (f" @ {cfg.lmstudio_base_url}" if "lmstudio" in provider else ""))

    @staticmethod
    def _history_entry(version, df, description, ops):
        """Return one adjustment-history record holding a snapshot copy of *df*."""
        import datetime

        return {
            "version": version,
            "df": df.copy(),
            "description": description,
            "ops": ops,
            "timestamp": datetime.datetime.now().strftime("%H:%M:%S"),
        }

    def _apply_text_adjustments(self, working_df, adj_history):
        """Apply every ``--adjust`` instruction in order; exit(1) on the first failure.

        Appends one record per instruction to *adj_history* and returns the
        adjusted DataFrame.
        """
        from statchat.core.adjuster import apply_instructions

        print("\n── Applying Text Adjustments ──────────────────────")
        for instruction in self.args.adjust:
            print(f" Instruction: \"{instruction}\"")
            try:
                working_df, descriptions, ops = apply_instructions(working_df, instruction)
                for d in descriptions:
                    print(f" [adjust] {d}")
                adj_history.append(self._history_entry(
                    len(adj_history) + 1, working_df, "; ".join(descriptions), ops
                ))
            except Exception as e:
                print(f" [ERROR] Could not apply: {e}")
                sys.exit(1)
        return working_df

    def _apply_image_adjustment(self, working_df, adj_history):
        """Apply the instructions extracted from the ``--adjust-image`` file.

        Exits with status 1 when the image is missing or cannot be processed.
        Appends one record to *adj_history* and returns the adjusted DataFrame.
        """
        from statchat.core.adjuster import apply_instructions_from_image

        print("\n── Parsing Annotated Image ────────────────────────")
        img_path = Path(self.args.adjust_image)
        if not img_path.exists():
            print(f" [ERROR] Image not found: {img_path}")
            sys.exit(1)
        print(f" Image: {img_path.name}")
        try:
            with open(img_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode("utf-8")
            working_df, descriptions, ops = apply_instructions_from_image(
                working_df, image_b64
            )
            for d in descriptions:
                print(f" [adjust] {d}")
            adj_history.append(self._history_entry(
                len(adj_history) + 1,
                working_df,
                f"From image '{img_path.name}': " + "; ".join(descriptions),
                ops,
            ))
        except Exception as e:
            print(f" [ERROR] Could not parse image: {e}")
            sys.exit(1)
        return working_df

    @staticmethod
    def _restore_target_column(analysis_df, original_df, target):
        """Put the untouched ROC target column back into *analysis_df*.

        Normalization may have rescaled the target, so its original values are
        copied back by position. That is only valid while both frames have the
        same number of rows; after rows were dropped or filtered a positional
        copy would pair labels with the wrong rows, so the column is left as
        it is and a warning message is returned instead.

        Returns:
            ``(analysis_df, warning)`` where *warning* is ``None`` unless the
            restore had to be skipped.
        """
        if not target or target not in original_df.columns:
            return analysis_df, None
        if len(original_df) != len(analysis_df):
            return analysis_df, (
                f"Row count changed ({len(original_df)} → {len(analysis_df)}); "
                f"'{target}' was not restored from the original data because "
                "rows can no longer be matched by position."
            )
        analysis_df[target] = original_df[target].reset_index(drop=True)
        return analysis_df, None

    @staticmethod
    def _print_results(results):
        """Print a console summary of every section in the analysis results."""
        print("\n── Analysis Results ───────────────────────────────")
        for section, data in results.items():
            if section == "error":
                print(f" [error] {data}")
            elif section == "correlation":
                print(f" [correlation] Matrix computed "
                      f"({len(data.columns)} x {len(data.columns)})")
            elif section == "roc_auc":
                for col, v in data.items():
                    if "auc" in v:
                        print(f" [roc_auc] {col}: AUC = {v['auc']:.4f}")
                    elif "error" in v:
                        print(f" [roc_auc] {col}: ERROR - {v['error']}")
            else:
                print(f"\n [{section}]")
                for col, vals in data.items():
                    summary = " ".join(
                        f"{k}={v:.4f}" if isinstance(v, float) else f"{k}={v}"
                        for k, v in vals.items()
                    )
                    print(f" {col}: {summary}")

    def _write_report(self, original_df, final_df, results,
                      clean_log, norm_log, adj_history):
        """Generate the PDF report; failures are printed, not raised."""
        print("\n── Generating PDF Report ──────────────────────────")
        try:
            # Seed the history with the original dataset, but only when
            # adjustments were actually made.
            full_history = []
            if adj_history:
                full_history = [
                    self._history_entry(0, original_df, "Original dataset", [])
                ] + adj_history

            out = generate_report(
                filepath=self.args.report,
                original_df=original_df,
                cleaned_df=final_df,
                analysis_results=results,
                clean_log=clean_log,
                norm_log=norm_log,
                source_file=self.args.input,
                adjustment_history=full_history if full_history else None,
            )
            print(f" [saved] Report → {out}")
        except Exception as e:
            print(f" [ERROR] Could not generate report: {e}")
|
|
File without changes
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"""
|
|
2
|
+
core/adjuster.py
|
|
3
|
+
----------------
|
|
4
|
+
Interprets natural-language adjustment instructions via the configured LLM
|
|
5
|
+
backend (Claude API or LM Studio) and applies them to a pandas DataFrame.
|
|
6
|
+
|
|
7
|
+
Adjustment instructions are translated into a small safe DSL:
|
|
8
|
+
{ "op": "add"|"subtract"|"multiply"|"divide"|"set"|"clip"|"round"|
|
|
9
|
+
"rename"|"drop_col"|"filter_rows"|"fillna"|"abs"|"log"|"sqrt",
|
|
10
|
+
"column": "<col>", # target column (or "*" for all numeric)
|
|
11
|
+
"value": <number|str>, # operand (where relevant)
|
|
12
|
+
"condition": "<expr>", # optional pandas .query() string
|
|
13
|
+
"description": "<human summary>"
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
Multiple operations are returned as a list.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import math
|
|
21
|
+
from typing import Optional
|
|
22
|
+
import pandas as pd
|
|
23
|
+
import numpy as np
|
|
24
|
+
|
|
25
|
+
from statchat.core.llm_backend import chat_json, get_config, LLMConfig
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ── System prompt ─────────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
_SYSTEM = """You are a data transformation assistant. The user will describe adjustments
|
|
31
|
+
they want to make to a pandas DataFrame in plain English.
|
|
32
|
+
|
|
33
|
+
You must respond with ONLY a JSON array of operation objects — no prose, no markdown fences.
|
|
34
|
+
|
|
35
|
+
Each object has these keys:
|
|
36
|
+
op : one of add | subtract | multiply | divide | set | clip | round |
|
|
37
|
+
rename | drop_col | filter_rows | fillna | abs | log | sqrt
|
|
38
|
+
column : the column name to operate on (use "*" to mean all numeric columns)
|
|
39
|
+
value : numeric value, new name string, or clip bounds [min, max]
|
|
40
|
+
condition : (optional) a pandas DataFrame.query() expression string to limit rows
|
|
41
|
+
description : a short human-readable summary of this operation
|
|
42
|
+
|
|
43
|
+
Examples:
|
|
44
|
+
User: "Add $1000 to each value in spend"
|
|
45
|
+
-> [{{"op":"add","column":"spend","value":1000,"description":"Add 1000 to spend"}}]
|
|
46
|
+
|
|
47
|
+
User: "Multiply income and age by 1.1"
|
|
48
|
+
-> [{{"op":"multiply","column":"income","value":1.1,"description":"Multiply income by 1.1"}},
|
|
49
|
+
{{"op":"multiply","column":"age","value":1.1,"description":"Multiply age by 1.1"}}]
|
|
50
|
+
|
|
51
|
+
User: "Rename 'score' to 'risk_score' and drop the 'region' column"
|
|
52
|
+
-> [{{"op":"rename","column":"score","value":"risk_score","description":"Rename score to risk_score"}},
|
|
53
|
+
{{"op":"drop_col","column":"region","description":"Drop column region"}}]
|
|
54
|
+
|
|
55
|
+
User: "Clip spend between 0 and 5000"
|
|
56
|
+
-> [{{"op":"clip","column":"spend","value":[0,5000],"description":"Clip spend to [0, 5000]"}}]
|
|
57
|
+
|
|
58
|
+
User: "Set all income values below 20000 to 20000"
|
|
59
|
+
-> [{{"op":"clip","column":"income","value":[20000,null],"description":"Floor income at 20000"}}]
|
|
60
|
+
|
|
61
|
+
User: "Fill nulls in income with 0"
|
|
62
|
+
-> [{{"op":"fillna","column":"income","value":0,"description":"Fill income nulls with 0"}}]
|
|
63
|
+
|
|
64
|
+
User: "Round age to 0 decimal places"
|
|
65
|
+
-> [{{"op":"round","column":"age","value":0,"description":"Round age to 0 decimals"}}]
|
|
66
|
+
|
|
67
|
+
User: "Remove rows where spend is negative"
|
|
68
|
+
-> [{{"op":"filter_rows","condition":"spend >= 0","description":"Remove rows where spend < 0"}}]
|
|
69
|
+
|
|
70
|
+
User: "Log-transform spend"
|
|
71
|
+
-> [{{"op":"log","column":"spend","description":"Natural log of spend"}}]
|
|
72
|
+
|
|
73
|
+
Columns available: {columns}
|
|
74
|
+
Numeric columns: {numeric_columns}
|
|
75
|
+
|
|
76
|
+
Only use column names from the list above. Respond with ONLY the JSON array."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ── Vision system prompt (for image/PDF annotation workflow) ──────────────────
|
|
80
|
+
|
|
81
|
+
_VISION_SYSTEM = """You are a data transformation assistant. The user has annotated a
|
|
82
|
+
data report (shown as an image) with handwritten notes, highlights, or typed
|
|
83
|
+
comments describing changes they want to make to the underlying dataset.
|
|
84
|
+
|
|
85
|
+
Extract ALL adjustment instructions visible in the image and translate them
|
|
86
|
+
into the same JSON operation format.
|
|
87
|
+
|
|
88
|
+
You must respond with ONLY a JSON array of operation objects — no prose, no markdown fences.
|
|
89
|
+
|
|
90
|
+
Each object has these keys:
|
|
91
|
+
op : one of add | subtract | multiply | divide | set | clip | round |
|
|
92
|
+
rename | drop_col | filter_rows | fillna | abs | log | sqrt
|
|
93
|
+
column : the column name to operate on (use "*" for all numeric columns)
|
|
94
|
+
value : numeric value, new name string, or clip bounds [min, max]
|
|
95
|
+
condition : (optional) a pandas .query() expression string
|
|
96
|
+
description : a short human-readable summary of this operation
|
|
97
|
+
|
|
98
|
+
Columns available: {columns}
|
|
99
|
+
Numeric columns: {numeric_columns}
|
|
100
|
+
|
|
101
|
+
Respond with ONLY the JSON array. If no clear instructions are found, return []."""
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ── Parsing ───────────────────────────────────────────────────────────────────
|
|
105
|
+
|
|
106
|
+
def parse_instructions(instruction: str, df: pd.DataFrame,
                       cfg: Optional[LLMConfig] = None) -> list[dict]:
    """Translate a plain-English *instruction* into operation dicts via the LLM.

    The system prompt is ``_SYSTEM`` with the DataFrame's column names and its
    numeric column names filled in as JSON lists. *cfg* overrides the backend
    configuration; when omitted, ``get_config()`` supplies it. The value
    returned by ``chat_json`` is passed through unchanged.
    """
    all_names = json.dumps(list(df.columns))
    numeric_names = json.dumps(list(df.select_dtypes(include="number").columns))
    prompt = _SYSTEM.format(columns=all_names, numeric_columns=numeric_names)
    backend_cfg = cfg if cfg else get_config()
    return chat_json(prompt, instruction, cfg=backend_cfg)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def parse_instructions_from_image(image_b64: str, df: pd.DataFrame,
                                  cfg: Optional[LLMConfig] = None) -> list[dict]:
    """
    Ask a vision LLM to read an annotated report image and return operation dicts.

    image_b64 : base64-encoded PNG or JPEG of the annotated report page.
    df        : current DataFrame; only its column names are sent, as hints
                in the system prompt.
    cfg       : optional backend configuration; defaults to ``get_config()``.
    """
    from dataclasses import replace

    all_names = json.dumps(list(df.columns))
    numeric_names = json.dumps(list(df.select_dtypes(include="number").columns))
    prompt = _VISION_SYSTEM.format(columns=all_names, numeric_columns=numeric_names)

    backend_cfg = cfg or get_config()
    # A text-only LM Studio configuration is switched to its vision variant;
    # the caller's config object itself is left unmodified.
    if backend_cfg.provider == "lmstudio":
        backend_cfg = replace(backend_cfg, provider="lmstudio_vision")

    request = "Please extract all adjustment instructions from this annotated report."
    return chat_json(prompt, request, image_b64=image_b64, cfg=backend_cfg)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ── Operation executor ────────────────────────────────────────────────────────
|
|
143
|
+
|
|
144
|
+
def _resolve_mask(df: pd.DataFrame, condition: Optional[str]) -> pd.Series:
    """Return a boolean row mask for *df* selecting the rows *condition* matches.

    Args:
        df: DataFrame the mask is built for.
        condition: expression evaluated with ``DataFrame.eval``; ``None`` or an
            empty string selects every row.

    Returns:
        A boolean Series indexed like *df*.

    Raises:
        ValueError: if the expression does not evaluate to a boolean Series
            (for example an arithmetic expression or an assignment), since
            such a result cannot be used to select rows.
    """
    if not condition:
        # Explicit bool dtype keeps the mask boolean even for an empty frame.
        return pd.Series(True, index=df.index, dtype=bool)

    # NOTE(security): the expression originates from LLM output, i.e. it is
    # untrusted text handed to DataFrame.eval. Review before exposing this to
    # inputs an attacker could influence.
    result = df.eval(condition)
    if not isinstance(result, pd.Series) or not pd.api.types.is_bool_dtype(result):
        raise ValueError(
            f"Condition {condition!r} must evaluate to a boolean value per row."
        )
    return result
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _require_number(val, op_name):
    """Coerce an operation's ``value`` to float or raise ValueError naming the op."""
    try:
        return float(val)
    except (TypeError, ValueError):
        raise ValueError(
            f"Operation '{op_name}' requires a numeric value, got {val!r}."
        ) from None


def _clip_bound(bound):
    """Convert one clip bound to float; ``None`` and NaN both mean 'no bound'."""
    if bound is None or (isinstance(bound, float) and math.isnan(bound)):
        return None
    return float(bound)


def _apply_elementwise(df, columns, mask, transform):
    """Replace each listed column of *df* (in place) with ``transform(column)``.

    When *mask* is a boolean Series only the rows it marks are changed; when it
    is ``None`` the whole column is replaced.
    """
    for tc in columns:
        changed = transform(df[tc])
        if mask is None:
            df[tc] = changed
        else:
            # where() keeps the original value wherever ~mask is True and
            # returns a new Series, so the dtype can widen if needed.
            df[tc] = df[tc].where(~mask, changed)


def apply_operation(df: pd.DataFrame, op: dict) -> tuple[pd.DataFrame, str]:
    """Apply a single operation dict to a copy of *df*.

    Args:
        df: input DataFrame; it is copied, never modified.
        op: operation with keys ``op``, ``column`` (``"*"`` means every numeric
            column), and optionally ``value``, ``condition`` and
            ``description``.

    Returns:
        ``(new_df, description)``. The description falls back to
        ``"<op> on <column>"`` when the operation carries none.

    Raises:
        ValueError: unknown operation, missing column, missing or non-numeric
            value for an arithmetic op, division by zero, malformed clip
            bounds, or ``filter_rows`` without a condition.
    """
    df = df.copy()
    o = str(op.get("op") or "").lower()  # tolerate a missing or null "op"
    col = op.get("column", "")
    val = op.get("value")
    cond = op.get("condition")
    desc = op.get("description", f"{o} on {col}")
    numeric = list(df.select_dtypes(include="number").columns)

    target_cols = numeric if col == "*" else [col]
    for tc in target_cols:
        if tc not in df.columns and o not in ("filter_rows", "rename"):
            raise ValueError(f"Column '{tc}' not found in dataset.")

    # filter_rows hands its condition to DataFrame.query below, so the mask is
    # only resolved for operations that restrict rows.
    restricted = bool(cond) and o != "filter_rows"
    if restricted:
        mask = _resolve_mask(df, cond)
    else:
        # No condition: every row is selected.
        mask = pd.Series(True, index=df.index, dtype=bool)
    row_filter = mask if restricted else None

    if o in ("add", "subtract", "multiply", "divide"):
        operand = _require_number(val, o)
        if o == "divide" and operand == 0:
            raise ValueError("Division by zero.")
        for tc in target_cols:
            current = df.loc[mask, tc]
            if o == "add":
                df.loc[mask, tc] = current + operand
            elif o == "subtract":
                df.loc[mask, tc] = current - operand
            elif o == "multiply":
                df.loc[mask, tc] = current * operand
            else:
                df.loc[mask, tc] = current / operand

    elif o == "set":
        for tc in target_cols:
            df.loc[mask, tc] = float(val) if isinstance(val, (int, float)) else val

    elif o == "clip":
        if isinstance(val, list):
            if len(val) < 2:
                raise ValueError("clip requires a value of the form [min, max].")
            lo, hi = _clip_bound(val[0]), _clip_bound(val[1])
        else:
            # A non-list value carries no bounds, so clipping is a no-op.
            lo = hi = None
        _apply_elementwise(df, target_cols, row_filter,
                           lambda s: s.clip(lower=lo, upper=hi))

    elif o == "round":
        decimals = int(val) if val is not None else 0
        _apply_elementwise(df, target_cols, row_filter, lambda s: s.round(decimals))

    elif o == "rename":
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found.")
        df = df.rename(columns={col: str(val)})

    elif o == "drop_col":
        for tc in target_cols:
            df = df.drop(columns=[tc], errors="ignore")

    elif o == "filter_rows":
        if not cond:
            raise ValueError("filter_rows requires a condition.")
        df = df.query(cond).reset_index(drop=True)

    elif o == "fillna":
        fill = float(val) if isinstance(val, (int, float)) else val
        _apply_elementwise(df, target_cols, row_filter, lambda s: s.fillna(fill))

    elif o == "abs":
        _apply_elementwise(df, target_cols, row_filter, lambda s: s.abs())

    elif o == "log":
        # Zeros become NaN first so the natural log never yields -inf.
        _apply_elementwise(df, target_cols, row_filter,
                           lambda s: np.log(s.replace(0, np.nan)))

    elif o == "sqrt":
        # Negative values are clipped to 0 before taking the root.
        _apply_elementwise(df, target_cols, row_filter,
                           lambda s: np.sqrt(s.clip(lower=0)))

    else:
        raise ValueError(f"Unknown operation: '{o}'")

    return df, desc
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def apply_instructions(df: pd.DataFrame, instruction: str,
                       cfg: Optional[LLMConfig] = None) -> tuple[pd.DataFrame, list[str], list[dict]]:
    """
    Turn a natural-language instruction into operations and apply them in order.

    The operations are parsed once, against the DataFrame as passed in, and
    then applied one after another, each to the result of the previous one.
    Returns (modified_df, descriptions_list, ops_list).
    """
    ops = parse_instructions(instruction, df, cfg=cfg)
    current = df
    notes = []
    for step in ops:
        current, note = apply_operation(current, step)
        notes.append(note)
    return current, notes, ops
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def apply_instructions_from_image(df: pd.DataFrame, image_b64: str,
                                  cfg: Optional[LLMConfig] = None
                                  ) -> tuple[pd.DataFrame, list[str], list[dict]]:
    """
    Extract operations from an annotated report image and apply them in order.

    Raises ValueError when the image yields no operations.
    Returns (modified_df, descriptions_list, ops_list).
    """
    ops = parse_instructions_from_image(image_b64, df, cfg=cfg)
    if not ops:
        raise ValueError("No adjustment instructions found in the image.")

    current = df
    notes = []
    for step in ops:
        current, note = apply_operation(current, step)
        notes.append(note)
    return current, notes, ops
|