statchat-app 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
statchat/__init__.py ADDED
File without changes
statchat/__main__.py ADDED
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Stat Chat — Data Cleaning, Normalization, Statistical Analysis & Iterative Adjustment
4
+
5
+ Entry points:
6
+ GUI: statchat
7
+ CLI: statchat --cli --input data.csv [options]
8
+ Also: python -m statchat
9
+ """
10
+
11
+ import argparse
12
+ import sys
13
+
14
+
15
+ # ── Dependency check ───────────────────────────────────────────────────────────
16
+
17
# Maps each importable module name (key) to the name of the pip distribution
# that provides it (value). The two differ for some packages, e.g.
# "sklearn" -> "scikit-learn" and "PIL" -> "Pillow".
REQUIRED = {
    "pandas": "pandas",
    "numpy": "numpy",
    "scipy": "scipy",
    "sklearn": "scikit-learn",
    "reportlab": "reportlab",
    "matplotlib": "matplotlib",
    "openpyxl": "openpyxl",
    "requests": "requests",
    "PIL": "Pillow",
}
28
+
29
+
30
def check_dependencies():
    """Return the pip names of every REQUIRED package that cannot be imported.

    The result preserves the order of REQUIRED and is empty when everything
    needed is importable.
    """
    def _importable(module_name):
        # Probe by actually importing; only ImportError counts as "missing".
        try:
            __import__(module_name)
        except ImportError:
            return False
        return True

    return [
        pip_name
        for module_name, pip_name in REQUIRED.items()
        if not _importable(module_name)
    ]
38
+
39
+
40
def prompt_install(missing):
    """Report missing dependencies, optionally install them, then exit.

    The list of missing packages and the install command are always printed
    to the console. If Tkinter is usable, a yes/no dialog additionally offers
    to run ``pip install`` with the current interpreter.

    Args:
        missing: pip distribution names that could not be imported.

    Raises:
        SystemExit: always, with status 1 (the application must be relaunched
            even after a successful install).
    """
    cmd = "pip install " + " ".join(missing)
    print("=" * 60)
    print(" Stat Chat — Missing Dependencies")
    print("=" * 60)
    print("\n The following packages are required but not installed:\n")
    for p in missing:
        print(f" • {p}")
    print(f"\n Install them by running:\n\n {cmd}\n")

    install_requested = False
    try:
        import tkinter as tk
        from tkinter import messagebox
        root = tk.Tk()
        try:
            root.withdraw()
            install_requested = messagebox.askyesno(
                "Stat Chat — Missing packages",
                "The following required packages are missing:\n\n"
                + "\n".join(f" • {p}" for p in missing)
                + f"\n\nInstall them now?\n\n{cmd}",
            )
        finally:
            # Always release the hidden root window, even if the dialog fails.
            root.destroy()
    except Exception:
        # Best effort only: without Tkinter or a display the console
        # instructions printed above are the fallback.
        install_requested = False

    if install_requested:
        import subprocess
        try:
            # Use the running interpreter's pip so packages land in this environment.
            subprocess.check_call([sys.executable, "-m", "pip", "install"] + list(missing))
        except (subprocess.CalledProcessError, OSError) as exc:
            # Previously swallowed silently; tell the user the install did not work.
            print(f"\n ✗ Installation failed: {exc}")
            print(f" Install the packages manually with:\n\n {cmd}\n")
        else:
            print("\n ✓ Installation complete. Relaunch Stat Chat.")

    sys.exit(1)
70
+
71
+
72
+ # ── Launchers ──────────────────────────────────────────────────────────────────
73
+
74
def run_gui():
    """Start the Tkinter GUI and block until its main loop ends.

    Exits with status 1 (after printing install hints) when Tkinter cannot
    be imported.
    """
    try:
        import tkinter
    except ImportError:
        for line in (
            "[ERROR] Tkinter is not available.",
            " Linux: sudo apt install python3-tk",
            " macOS/Windows: reinstall Python from python.org",
        ):
            print(line)
        sys.exit(1)

    # Deferred import: the GUI package is loaded only after the Tkinter check.
    from statchat.gui.app import StatChatApp

    window = tkinter.Tk()
    StatChatApp(window)
    window.mainloop()
87
+
88
+
89
def run_cli(args):
    """Run the command-line pipeline for the already-parsed ``args``."""
    # Deferred import: the CLI runner is loaded only when CLI mode is chosen.
    from statchat.cli.runner import CLIRunner

    runner = CLIRunner(args)
    runner.run()
92
+
93
+
94
+ # ── CLI argument parser ────────────────────────────────────────────────────────
95
+
96
def build_parser():
    """Build and return the ``statchat`` command-line argument parser.

    Options are registered in a fixed order (general, cleaning, normalization,
    analysis, adjustment, LLM backend) so the generated help keeps its layout.
    """
    def add_switches(target, switches):
        # Register a run of boolean (store_true) options, in the given order.
        for option, description in switches:
            target.add_argument(option, action="store_true", help=description)

    parser = argparse.ArgumentParser(
        description="Stat Chat — Data Cleaning, Normalization & Statistical Analysis",
        formatter_class=argparse.RawTextHelpFormatter,
        prog="statchat",
    )

    # General options.
    add_switches(parser, [("--cli", "Run in CLI mode (no GUI)")])
    parser.add_argument("--input", "-i", type=str, help="Input CSV or Excel file")
    parser.add_argument("--output", "-o", type=str, help="Path to save processed data")
    parser.add_argument(
        "--output-format",
        choices=["csv", "xlsx", "json"],
        default="csv",
        help="Output format (default: csv)",
    )
    parser.add_argument("--report", "-r", type=str, help="Path to save PDF report")

    cleaning = parser.add_argument_group("Cleaning Options")
    add_switches(cleaning, [
        ("--drop-duplicates", "Remove duplicate rows"),
        ("--drop-nulls", "Drop rows with null values"),
    ])
    cleaning.add_argument(
        "--fill-nulls",
        choices=["mean", "median", "mode", "zero"],
        help="Fill null values with strategy",
    )

    normalization = parser.add_argument_group("Normalization Options")
    normalization.add_argument(
        "--normalize",
        choices=["zscore", "minmax", "robust"],
        help="Normalization method",
    )
    normalization.add_argument(
        "--norm-mean", type=float, default=0.0,
        help="Z-score target mean (default: 0.0)",
    )
    normalization.add_argument(
        "--norm-std", type=float, default=1.0,
        help="Z-score target std (default: 1.0)",
    )

    analysis = parser.add_argument_group("Analysis Metrics")
    add_switches(analysis, [
        ("--central-tendency", "Mean, median, mode"),
        ("--dispersion", "Std dev, variance, IQR, range"),
        ("--shape", "Skewness & kurtosis"),
        ("--correlation", "Correlation matrix"),
        ("--percentiles", "P5–P95 percentiles"),
    ])
    analysis.add_argument(
        "--roc-auc", type=str, metavar="TARGET_COL",
        help="ROC-AUC vs a binary target column",
    )
    add_switches(analysis, [("--all-metrics", "Run all metrics")])

    adjustment = parser.add_argument_group("Adjustment Options (requires LLM backend)")
    adjustment.add_argument(
        "--adjust", action="append", metavar="INSTRUCTION",
        help="Natural-language adjustment (repeatable).\n"
             "e.g. --adjust 'Add 1000 to spend'",
    )
    adjustment.add_argument(
        "--adjust-image", type=str, metavar="IMAGE_PATH",
        help="Annotated report image — vision model extracts instructions",
    )

    backend = parser.add_argument_group("LLM Backend Options")
    backend.add_argument(
        "--backend",
        choices=["claude", "lmstudio", "lmstudio_vision"],
        default="claude",
        help="LLM provider (default: claude)",
    )
    backend.add_argument(
        "--lmstudio-url", type=str, default="http://localhost:1234",
        metavar="URL", help="LM Studio server URL",
    )
    backend.add_argument(
        "--lmstudio-model", type=str, default="", metavar="MODEL_ID",
        help="LM Studio text model ID",
    )
    backend.add_argument(
        "--lmstudio-vision-model", type=str, default="", metavar="MODEL_ID",
        help="LM Studio vision model ID",
    )

    return parser
151
+
152
+
153
+ # ── Main entry point ───────────────────────────────────────────────────────────
154
+
155
def main():
    """Program entry point: verify dependencies, parse arguments, dispatch.

    Launches the GUI unless ``--cli`` is given; CLI mode requires ``--input``.
    """
    missing = check_dependencies()
    if missing:
        # prompt_install() terminates the process via sys.exit().
        prompt_install(missing)

    parser = build_parser()
    args = parser.parse_args()

    if not args.cli:
        run_gui()
        return

    if not args.input:
        # parser.error() prints usage and exits.
        parser.error("--input is required in CLI mode")
    run_cli(args)
169
+
170
+
171
# Run the application when this module is executed directly
# (e.g. `python -m statchat`), but not when it is merely imported.
if __name__ == "__main__":
    main()
Binary file
Binary file
File without changes
statchat/cli/runner.py ADDED
@@ -0,0 +1,197 @@
1
+ """CLI mode runner."""
2
+
3
+ import sys
4
+ import base64
5
+ from pathlib import Path
6
+ from statchat.core.loader import load_file, save_file, get_file_info
7
+ from statchat.core.cleaner import apply_cleaning, apply_normalization
8
+ from statchat.core.analyzer import run_analysis
9
+ from statchat.core.reporter import generate_report
10
+
11
+
12
class CLIRunner:
    """Run the Stat Chat pipeline non-interactively from parsed CLI arguments.

    Stages, in order: load -> configure LLM backend (only when adjustments
    are requested) -> clean -> normalize -> text adjustments -> image
    adjustment -> save data -> analysis -> PDF report.
    """

    def __init__(self, args):
        # Parsed command-line namespace; read by every stage.
        self.args = args

    def run(self):
        """Execute every requested stage, printing progress to stdout.

        Exits the process with status 1 when the input cannot be loaded or an
        adjustment fails. A failing PDF report is reported but does not abort.
        """
        args = self.args
        print("\n── Stat Chat CLI ──────────────────────────────────")
        print(f"Loading: {args.input}")

        try:
            original_df = load_file(args.input)
        except Exception as e:
            print(f"[ERROR] {e}")
            sys.exit(1)

        info = get_file_info(original_df)
        print(f" Rows: {info['rows']} | Columns: {info['columns']}")
        print(f" Numeric cols: {info['numeric_columns']}")

        # The LLM backend is only needed for adjustments.
        if args.adjust or args.adjust_image:
            self._configure_backend()

        cleaned_df, clean_log, norm_log = self._clean_and_normalize(original_df)

        # Adjustments operate on a copy so the cleaned frame stays untouched.
        adj_history = []
        working_df = cleaned_df.copy()
        if args.adjust:
            working_df = self._apply_text_adjustments(working_df, adj_history)
        if args.adjust_image:
            working_df = self._apply_image_adjustment(working_df, adj_history)

        # Post-adjustment data (or just the cleaned data if nothing was adjusted).
        final_df = working_df
        if args.output:
            out_path = save_file(final_df, args.output, fmt=args.output_format)
            print(f"\n [saved] Data → {out_path}")

        results = self._analyze(original_df, final_df)
        if results:
            self._print_results(results)

        if args.report:
            self._write_report(original_df, final_df, results,
                               clean_log, norm_log, adj_history)

        print("\n── Done ───────────────────────────────────────────\n")

    # ── Stage helpers ─────────────────────────────────────────────────────────

    def _configure_backend(self):
        """Build the LLM configuration from the arguments and install it globally."""
        from statchat.core.llm_backend import LLMConfig, set_config
        args = self.args
        provider = getattr(args, "backend", "claude")
        cfg = LLMConfig(
            provider=provider,
            lmstudio_base_url=getattr(args, "lmstudio_url", "http://localhost:1234"),
            lmstudio_model=getattr(args, "lmstudio_model", ""),
            lmstudio_vision_model=getattr(args, "lmstudio_vision_model", ""),
        )
        set_config(cfg)
        print(f" [backend] {provider}"
              + (f" @ {cfg.lmstudio_base_url}" if "lmstudio" in provider else ""))

    def _clean_and_normalize(self, original_df):
        """Clean then normalize a copy of ``original_df``.

        Returns (processed_df, clean_log, norm_log); both logs are echoed.
        """
        args = self.args
        clean_opts = {
            "drop_duplicates": args.drop_duplicates,
            "drop_nulls": args.drop_nulls,
            "fill_nulls": args.fill_nulls,
        }
        cleaned_df, clean_log = apply_cleaning(original_df.copy(), clean_opts)
        for msg in clean_log:
            print(f" [clean] {msg}")

        norm_opts = {
            "normalize": args.normalize,
            "norm_mean": args.norm_mean,
            "norm_std": args.norm_std,
        }
        cleaned_df, norm_log = apply_normalization(cleaned_df, norm_opts)
        for msg in norm_log:
            print(f" [norm] {msg}")
        return cleaned_df, clean_log, norm_log

    @staticmethod
    def _history_entry(version, df, description, ops):
        """Build one adjustment-history record holding a snapshot copy of ``df``."""
        import datetime
        return {
            "version": version,
            "df": df.copy(),
            "description": description,
            "ops": ops,
            "timestamp": datetime.datetime.now().strftime("%H:%M:%S"),
        }

    def _apply_text_adjustments(self, working_df, adj_history):
        """Apply each ``--adjust`` instruction in order; exit(1) on the first failure.

        Appends one record per instruction to ``adj_history`` (mutated in
        place) and returns the adjusted DataFrame.
        """
        from statchat.core.adjuster import apply_instructions
        print("\n── Applying Text Adjustments ──────────────────────")
        for instruction in self.args.adjust:
            print(f" Instruction: \"{instruction}\"")
            try:
                working_df, descriptions, ops = apply_instructions(working_df, instruction)
                for d in descriptions:
                    print(f" [adjust] {d}")
                adj_history.append(self._history_entry(
                    len(adj_history) + 1, working_df, "; ".join(descriptions), ops,
                ))
            except Exception as e:
                print(f" [ERROR] Could not apply: {e}")
                sys.exit(1)
        return working_df

    def _apply_image_adjustment(self, working_df, adj_history):
        """Apply instructions extracted from the ``--adjust-image`` file.

        Exits with status 1 when the image is missing or cannot be processed.
        Appends one record to ``adj_history`` and returns the adjusted DataFrame.
        """
        from statchat.core.adjuster import apply_instructions_from_image
        print("\n── Parsing Annotated Image ────────────────────────")
        img_path = Path(self.args.adjust_image)
        if not img_path.exists():
            print(f" [ERROR] Image not found: {img_path}")
            sys.exit(1)
        print(f" Image: {img_path.name}")
        try:
            with open(img_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode("utf-8")
            working_df, descriptions, ops = apply_instructions_from_image(
                working_df, image_b64
            )
            for d in descriptions:
                print(f" [adjust] {d}")
            adj_history.append(self._history_entry(
                len(adj_history) + 1,
                working_df,
                f"From image '{img_path.name}': " + "; ".join(descriptions),
                ops,
            ))
        except Exception as e:
            print(f" [ERROR] Could not parse image: {e}")
            sys.exit(1)
        return working_df

    def _analyze(self, original_df, final_df):
        """Run the requested metrics on ``final_df`` and return the results."""
        args = self.args
        df_for_analysis = final_df.copy().reset_index(drop=True)
        if args.roc_auc and args.roc_auc in original_df.columns:
            # Re-attach the untransformed target column from the original data.
            # NOTE(review): both sides are reset to positional indices, so if
            # cleaning or filter_rows removed rows the target may no longer
            # line up with the remaining rows — confirm against apply_cleaning.
            df_for_analysis[args.roc_auc] = original_df[args.roc_auc].reset_index(drop=True)

        analysis_opts = {
            "central_tendency": args.central_tendency,
            "dispersion": args.dispersion,
            "shape": args.shape,
            "correlation": args.correlation,
            "percentiles": args.percentiles,
            "normality": False,
            "roc_auc": args.roc_auc,
            "all_metrics": args.all_metrics,
            "original_df": original_df,
        }
        return run_analysis(df_for_analysis, analysis_opts)

    @staticmethod
    def _print_results(results):
        """Print a compact, per-section summary of the analysis results."""
        print("\n── Analysis Results ───────────────────────────────")
        for section, data in results.items():
            if section == "error":
                print(f" [error] {data}")
            elif section == "correlation":
                # Only the matrix size is shown on the console.
                print(f" [correlation] Matrix computed "
                      f"({len(data.columns)} x {len(data.columns)})")
            elif section == "roc_auc":
                for col, v in data.items():
                    if "auc" in v:
                        print(f" [roc_auc] {col}: AUC = {v['auc']:.4f}")
                    elif "error" in v:
                        print(f" [roc_auc] {col}: ERROR - {v['error']}")
            else:
                print(f"\n [{section}]")
                for col, vals in data.items():
                    summary = " ".join(
                        f"{k}={v:.4f}" if isinstance(v, float) else f"{k}={v}"
                        for k, v in vals.items()
                    )
                    print(f" {col}: {summary}")

    def _write_report(self, original_df, final_df, results,
                      clean_log, norm_log, adj_history):
        """Generate the PDF report; a failure is printed but does not abort the run."""
        print("\n── Generating PDF Report ──────────────────────────")
        try:
            # Seed the history with the original dataset if adjustments were made.
            full_history = []
            if adj_history:
                full_history = [
                    self._history_entry(0, original_df, "Original dataset", [])
                ] + adj_history

            out = generate_report(
                filepath=self.args.report,
                original_df=original_df,
                cleaned_df=final_df,
                analysis_results=results,
                clean_log=clean_log,
                norm_log=norm_log,
                source_file=self.args.input,
                adjustment_history=full_history if full_history else None,
            )
            print(f" [saved] Report → {out}")
        except Exception as e:
            print(f" [ERROR] Could not generate report: {e}")
File without changes
@@ -0,0 +1,265 @@
1
+ """
2
+ core/adjuster.py
3
+ ----------------
4
+ Interprets natural-language adjustment instructions via the configured LLM
5
+ backend (Claude API or LM Studio) and applies them to a pandas DataFrame.
6
+
7
+ Adjustment instructions are translated into a small safe DSL:
8
+ { "op": "add"|"subtract"|"multiply"|"divide"|"set"|"clip"|"round"|
9
+ "rename"|"drop_col"|"filter_rows"|"fillna"|"abs"|"log"|"sqrt",
10
+ "column": "<col>", # target column (or "*" for all numeric)
11
+ "value": <number|str>, # operand (where relevant)
12
+ "condition": "<expr>", # optional pandas .query() string
13
+ "description": "<human summary>"
14
+ }
15
+
16
+ Multiple operations are returned as a list.
17
+ """
18
+
19
+ import json
20
+ import math
21
+ from typing import Optional
22
+ import pandas as pd
23
+ import numpy as np
24
+
25
+ from statchat.core.llm_backend import chat_json, get_config, LLMConfig
26
+
27
+
28
+ # ── System prompt ─────────────────────────────────────────────────────────────
29
+
30
# Template for the text-instruction system prompt. It is filled in with
# str.format() (see parse_instructions), so {columns} and {numeric_columns}
# are placeholders and the literal JSON braces in the examples are doubled.
_SYSTEM = """You are a data transformation assistant. The user will describe adjustments
they want to make to a pandas DataFrame in plain English.

You must respond with ONLY a JSON array of operation objects — no prose, no markdown fences.

Each object has these keys:
op : one of add | subtract | multiply | divide | set | clip | round |
rename | drop_col | filter_rows | fillna | abs | log | sqrt
column : the column name to operate on (use "*" to mean all numeric columns)
value : numeric value, new name string, or clip bounds [min, max]
condition : (optional) a pandas DataFrame.query() expression string to limit rows
description : a short human-readable summary of this operation

Examples:
User: "Add $1000 to each value in spend"
-> [{{"op":"add","column":"spend","value":1000,"description":"Add 1000 to spend"}}]

User: "Multiply income and age by 1.1"
-> [{{"op":"multiply","column":"income","value":1.1,"description":"Multiply income by 1.1"}},
{{"op":"multiply","column":"age","value":1.1,"description":"Multiply age by 1.1"}}]

User: "Rename 'score' to 'risk_score' and drop the 'region' column"
-> [{{"op":"rename","column":"score","value":"risk_score","description":"Rename score to risk_score"}},
{{"op":"drop_col","column":"region","description":"Drop column region"}}]

User: "Clip spend between 0 and 5000"
-> [{{"op":"clip","column":"spend","value":[0,5000],"description":"Clip spend to [0, 5000]"}}]

User: "Set all income values below 20000 to 20000"
-> [{{"op":"clip","column":"income","value":[20000,null],"description":"Floor income at 20000"}}]

User: "Fill nulls in income with 0"
-> [{{"op":"fillna","column":"income","value":0,"description":"Fill income nulls with 0"}}]

User: "Round age to 0 decimal places"
-> [{{"op":"round","column":"age","value":0,"description":"Round age to 0 decimals"}}]

User: "Remove rows where spend is negative"
-> [{{"op":"filter_rows","condition":"spend >= 0","description":"Remove rows where spend < 0"}}]

User: "Log-transform spend"
-> [{{"op":"log","column":"spend","description":"Natural log of spend"}}]

Columns available: {columns}
Numeric columns: {numeric_columns}

Only use column names from the list above. Respond with ONLY the JSON array."""
77
+
78
+
79
+ # ── Vision system prompt (for image/PDF annotation workflow) ──────────────────
80
+
81
# Template for the image-annotation system prompt. It is filled in with
# str.format() (see parse_instructions_from_image); {columns} and
# {numeric_columns} are the only placeholders.
_VISION_SYSTEM = """You are a data transformation assistant. The user has annotated a
data report (shown as an image) with handwritten notes, highlights, or typed
comments describing changes they want to make to the underlying dataset.

Extract ALL adjustment instructions visible in the image and translate them
into the same JSON operation format.

You must respond with ONLY a JSON array of operation objects — no prose, no markdown fences.

Each object has these keys:
op : one of add | subtract | multiply | divide | set | clip | round |
rename | drop_col | filter_rows | fillna | abs | log | sqrt
column : the column name to operate on (use "*" for all numeric columns)
value : numeric value, new name string, or clip bounds [min, max]
condition : (optional) a pandas .query() expression string
description : a short human-readable summary of this operation

Columns available: {columns}
Numeric columns: {numeric_columns}

Respond with ONLY the JSON array. If no clear instructions are found, return []."""
102
+
103
+
104
+ # ── Parsing ───────────────────────────────────────────────────────────────────
105
+
106
def parse_instructions(instruction: str, df: pd.DataFrame,
                       cfg: Optional[LLMConfig] = None) -> list[dict]:
    """Translate a plain-English instruction into operation dicts via the LLM.

    The system prompt is built from ``_SYSTEM`` with the frame's column names
    (all, and numeric only) embedded as JSON lists. When ``cfg`` is falsy the
    globally configured backend is used.
    """
    all_columns = json.dumps(list(df.columns))
    numeric_only = json.dumps(list(df.select_dtypes(include="number").columns))
    prompt = _SYSTEM.format(columns=all_columns, numeric_columns=numeric_only)
    active_cfg = cfg or get_config()
    return chat_json(prompt, instruction, cfg=active_cfg)
116
+
117
+
118
def parse_instructions_from_image(image_b64: str, df: pd.DataFrame,
                                  cfg: Optional[LLMConfig] = None) -> list[dict]:
    """Ask a vision LLM to extract operation dicts from an annotated report image.

    image_b64 : base64-encoded PNG or JPEG of the annotated report page.
    df        : current DataFrame; only its column names are put in the prompt.
    cfg       : backend configuration; the global one is used when falsy.
    """
    all_columns = json.dumps(list(df.columns))
    numeric_only = json.dumps(list(df.select_dtypes(include="number").columns))
    prompt = _VISION_SYSTEM.format(columns=all_columns, numeric_columns=numeric_only)

    vision_cfg = cfg if cfg else get_config()
    if vision_cfg.provider == "lmstudio":
        # A text-only LM Studio config is switched to its vision variant on a
        # copy, leaving the caller's / global config object unchanged.
        import dataclasses
        vision_cfg = dataclasses.replace(vision_cfg, provider="lmstudio_vision")

    request = "Please extract all adjustment instructions from this annotated report."
    return chat_json(prompt, request, image_b64=image_b64, cfg=vision_cfg)
140
+
141
+
142
+ # ── Operation executor ────────────────────────────────────────────────────────
143
+
144
+ def _resolve_mask(df: pd.DataFrame, condition: Optional[str]) -> pd.Series:
145
+ if not condition:
146
+ return pd.Series([True] * len(df), index=df.index)
147
+ return df.eval(condition)
148
+
149
+
150
+ def apply_operation(df: pd.DataFrame, op: dict) -> tuple[pd.DataFrame, str]:
151
+ """Apply a single operation dict to df. Returns (new_df, description)."""
152
+ df = df.copy()
153
+ o = op.get("op", "").lower()
154
+ col = op.get("column", "")
155
+ val = op.get("value")
156
+ cond = op.get("condition")
157
+ desc = op.get("description", f"{o} on {col}")
158
+ numeric = list(df.select_dtypes(include="number").columns)
159
+
160
+ target_cols = numeric if col == "*" else [col]
161
+ for tc in target_cols:
162
+ if tc not in df.columns and o not in ("filter_rows", "rename"):
163
+ raise ValueError(f"Column '{tc}' not found in dataset.")
164
+
165
+ mask = _resolve_mask(df, cond)
166
+
167
+ if o == "add":
168
+ for tc in target_cols:
169
+ df.loc[mask, tc] = df.loc[mask, tc] + float(val)
170
+
171
+ elif o == "subtract":
172
+ for tc in target_cols:
173
+ df.loc[mask, tc] = df.loc[mask, tc] - float(val)
174
+
175
+ elif o == "multiply":
176
+ for tc in target_cols:
177
+ df.loc[mask, tc] = df.loc[mask, tc] * float(val)
178
+
179
+ elif o == "divide":
180
+ if float(val) == 0:
181
+ raise ValueError("Division by zero.")
182
+ for tc in target_cols:
183
+ df.loc[mask, tc] = df.loc[mask, tc] / float(val)
184
+
185
+ elif o == "set":
186
+ for tc in target_cols:
187
+ df.loc[mask, tc] = float(val) if isinstance(val, (int, float)) else val
188
+
189
+ elif o == "clip":
190
+ lo, hi = (val[0], val[1]) if isinstance(val, list) else (None, None)
191
+ lo = None if lo is None or (isinstance(lo, float) and math.isnan(lo)) else float(lo)
192
+ hi = None if hi is None or (isinstance(hi, float) and math.isnan(hi)) else float(hi)
193
+ for tc in target_cols:
194
+ df[tc] = df[tc].clip(lower=lo, upper=hi)
195
+
196
+ elif o == "round":
197
+ decimals = int(val) if val is not None else 0
198
+ for tc in target_cols:
199
+ df[tc] = df[tc].round(decimals)
200
+
201
+ elif o == "rename":
202
+ if col not in df.columns:
203
+ raise ValueError(f"Column '{col}' not found.")
204
+ df = df.rename(columns={col: str(val)})
205
+
206
+ elif o == "drop_col":
207
+ for tc in target_cols:
208
+ df = df.drop(columns=[tc], errors="ignore")
209
+
210
+ elif o == "filter_rows":
211
+ if not cond:
212
+ raise ValueError("filter_rows requires a condition.")
213
+ df = df.query(cond).reset_index(drop=True)
214
+
215
+ elif o == "fillna":
216
+ for tc in target_cols:
217
+ df[tc] = df[tc].fillna(float(val) if isinstance(val, (int, float)) else val)
218
+
219
+ elif o == "abs":
220
+ for tc in target_cols:
221
+ df[tc] = df[tc].abs()
222
+
223
+ elif o == "log":
224
+ for tc in target_cols:
225
+ df[tc] = np.log(df[tc].replace(0, np.nan))
226
+
227
+ elif o == "sqrt":
228
+ for tc in target_cols:
229
+ df[tc] = np.sqrt(df[tc].clip(lower=0))
230
+
231
+ else:
232
+ raise ValueError(f"Unknown operation: '{o}'")
233
+
234
+ return df, desc
235
+
236
+
237
def apply_instructions(df: pd.DataFrame, instruction: str,
                       cfg: Optional[LLMConfig] = None) -> tuple[pd.DataFrame, list[str], list[dict]]:
    """
    Parse and apply all operations from a natural-language instruction.

    Operations are applied in the order returned by the LLM, each on the
    result of the previous one.

    Returns (modified_df, descriptions_list, ops_list).

    Raises ValueError when the LLM response is not a list of operation
    objects, or when an operation cannot be applied.
    """
    ops = parse_instructions(instruction, df, cfg=cfg)
    # The prompt asks for a JSON array, but a model may answer with a single
    # object; accept that instead of iterating over the dict's keys.
    if isinstance(ops, dict):
        ops = [ops]
    if not isinstance(ops, list):
        raise ValueError(
            f"Expected a list of operations from the LLM, got {type(ops).__name__}."
        )

    descriptions = []
    for op in ops:
        if not isinstance(op, dict):
            raise ValueError(f"Invalid operation (expected an object): {op!r}")
        df, desc = apply_operation(df, op)
        descriptions.append(desc)
    return df, descriptions, ops
249
+
250
+
251
def apply_instructions_from_image(df: pd.DataFrame, image_b64: str,
                                  cfg: Optional[LLMConfig] = None
                                  ) -> tuple[pd.DataFrame, list[str], list[dict]]:
    """
    Parse and apply operations extracted from an annotated report image.

    Operations are applied in the order returned by the LLM, each on the
    result of the previous one.

    Returns (modified_df, descriptions_list, ops_list).

    Raises ValueError when no instructions are found, when the LLM response
    is not a list of operation objects, or when an operation cannot be applied.
    """
    ops = parse_instructions_from_image(image_b64, df, cfg=cfg)
    if not ops:
        raise ValueError("No adjustment instructions found in the image.")
    # The prompt asks for a JSON array, but a model may answer with a single
    # object; accept that instead of iterating over the dict's keys.
    if isinstance(ops, dict):
        ops = [ops]
    if not isinstance(ops, list):
        raise ValueError(
            f"Expected a list of operations from the LLM, got {type(ops).__name__}."
        )

    descriptions = []
    for op in ops:
        if not isinstance(op, dict):
            raise ValueError(f"Invalid operation (expected an object): {op!r}")
        df, desc = apply_operation(df, op)
        descriptions.append(desc)
    return df, descriptions, ops