querymind-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/agents/InterpreterAgent.py +473 -0
- app/agents/__init__.py +0 -0
- app/agents/insights_generator.py +151 -0
- app/agents/intent_corrector.py +59 -0
- app/agents/llm_intepreter.py +132 -0
- app/agents/narrator.py +27 -0
- app/agents/planner.py +77 -0
- app/cli/__init__.py +0 -0
- app/cli/main.py +346 -0
- app/cli/tui_app.py +98 -0
- app/cli/ui.py +21 -0
- app/core/__init__.py +0 -0
- app/core/context.py +10 -0
- app/core/logger.py +2 -0
- app/core/pipeline.py +379 -0
- app/data/__init__.py +0 -0
- app/data/connectors/csv_connector.py +99 -0
- app/data/connectors/excel_connector.py +68 -0
- app/data/connectors/no_sql_db_connector.py +0 -0
- app/data/connectors/sql_db_connector.py +0 -0
- app/data/schema_engine.py +18 -0
- app/data/type_caster.py +128 -0
- app/executor/__init__.py +0 -0
- app/executor/db_executor.py +0 -0
- app/executor/sheet_selector.py +120 -0
- app/llm/ollama_client.py +47 -0
- app/prompts/interpreter_prompt.txt +28 -0
- app/security/__init__.py +0 -0
- app/security/input_guard.py +133 -0
- app/security/schema_filter.py +20 -0
- app/tests/__init__.py +0 -0
- app/tests/llm_test.py +18 -0
- app/tools/__init__.py +0 -0
- app/tools/analyzer.py +157 -0
- app/tools/join_resolver.py +159 -0
- app/tools/sql_writer.py +37 -0
- app/tools/validator.py +0 -0
- querymind_cli-0.1.0.dist-info/METADATA +139 -0
- querymind_cli-0.1.0.dist-info/RECORD +43 -0
- querymind_cli-0.1.0.dist-info/WHEEL +5 -0
- querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
- querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- querymind_cli-0.1.0.dist-info/top_level.txt +1 -0
app/data/type_caster.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared column type-casting logic used by both CSVConnector and ExcelConnector.
|
|
3
|
+
|
|
4
|
+
smart_cast_df(df) processes every column:
|
|
5
|
+
- Already datetime → leave alone
|
|
6
|
+
- Already numeric → packed-date check, then whole-number downcast
|
|
7
|
+
- Object/string → try numeric → packed-date check → whole-number downcast
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _try_packed_date(int_series: pd.Series, original_series: pd.Series):
|
|
14
|
+
"""
|
|
15
|
+
Try to parse an integer series as a packed date
|
|
16
|
+
(DDMMYYYY, MMDDYYYY, YYYYMMDD) with zero-padding for 7-digit values.
|
|
17
|
+
|
|
18
|
+
Returns (datetime_series, fmt) if successful, None otherwise.
|
|
19
|
+
"""
|
|
20
|
+
digits = int_series.astype(str).str.len()
|
|
21
|
+
mostly_7_8 = ((digits >= 7) & (digits <= 8)).sum() / len(digits)
|
|
22
|
+
|
|
23
|
+
if mostly_7_8 <= 0.8:
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
padded = int_series.astype(str).str.zfill(8)
|
|
27
|
+
|
|
28
|
+
for fmt in ("%d%m%Y", "%m%d%Y", "%Y%m%d"):
|
|
29
|
+
try:
|
|
30
|
+
candidate = pd.to_datetime(padded, format=fmt, errors="raise")
|
|
31
|
+
if not (
|
|
32
|
+
(candidate.dt.year >= 1900).all() and (candidate.dt.year <= 2100).all()
|
|
33
|
+
):
|
|
34
|
+
continue
|
|
35
|
+
|
|
36
|
+
full_padded = (
|
|
37
|
+
pd.to_numeric(original_series, errors="coerce")
|
|
38
|
+
.astype("Int64")
|
|
39
|
+
.astype(str)
|
|
40
|
+
.str.zfill(8)
|
|
41
|
+
.replace("<NA>", pd.NaT)
|
|
42
|
+
)
|
|
43
|
+
result = pd.to_datetime(full_padded, format=fmt, errors="coerce")
|
|
44
|
+
return result, fmt
|
|
45
|
+
|
|
46
|
+
except Exception:
|
|
47
|
+
continue
|
|
48
|
+
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _try_downcast_to_int(series: pd.Series) -> pd.Series:
|
|
53
|
+
"""
|
|
54
|
+
If all non-null values in a float series are whole numbers
|
|
55
|
+
(e.g. 553.0, 1733.0), convert to nullable Int64 so they
|
|
56
|
+
display as 553, 1733 instead of 553.0, 1733.0.
|
|
57
|
+
|
|
58
|
+
Uses Int64 (nullable) rather than int64 so NaN rows are preserved.
|
|
59
|
+
"""
|
|
60
|
+
if series.dtype not in ("float64", "float32"):
|
|
61
|
+
return series
|
|
62
|
+
|
|
63
|
+
non_null = series.dropna()
|
|
64
|
+
if len(non_null) == 0:
|
|
65
|
+
return series
|
|
66
|
+
|
|
67
|
+
if (non_null == non_null.astype("int64")).all():
|
|
68
|
+
return series.astype("Int64")
|
|
69
|
+
|
|
70
|
+
return series
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _integer_probe(numeric: pd.Series):
    """Return (finite non-null values truncated to int64, had_non_finite)."""
    present = numeric.dropna()
    finite = present[present.abs() != float("inf")]
    return finite.astype("int64"), len(finite) != len(present)


def smart_cast_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Intelligently cast each column to the most appropriate type.

    Processing order per column:
    1. Already datetime → skip
    2. Numeric dtype (int/float from Excel) → packed-date check,
       then whole-number float → Int64 downcast
    3. Object/string → try numeric cast (>70% parseable),
       then packed-date check, then whole-number downcast
    Columns of any other dtype are left untouched.

    Infinite values are left out of the packed-date probe, and a column
    containing them is stored as-is without attempting the integer downcast
    (an infinite value can never be an integer).

    The frame is modified in place and also returned. A message is printed
    for every column converted to dates.
    """
    numeric_dtypes = ("int64", "int32", "float64", "float32", "Int64", "Int32")

    for col in df.columns:
        column = df[col]
        dtype_str = str(column.dtype)

        # Already datetime — nothing to do
        if "datetime" in dtype_str:
            continue

        if dtype_str in numeric_dtypes:
            # Already numeric (common with Excel-loaded columns)
            numeric = column
        elif column.dtype == object or dtype_str in ("string", "str"):
            numeric = pd.to_numeric(column, errors="coerce")
            parseable = numeric.notna().sum() / max(len(df), 1)
            if parseable <= 0.7:
                continue  # Not numeric enough — leave as object/string
        else:
            continue

        int_series, has_non_finite = _integer_probe(numeric)

        if len(int_series) > 0:
            detected = _try_packed_date(int_series, column)
            if detected is not None:
                dt_col, fmt = detected
                df[col] = dt_col
                print(f"📅 '{col}' detected as packed date ({fmt})")
                continue

        # Downcast whole-number floats before storing
        df[col] = numeric if has_non_finite else _try_downcast_to_int(numeric)

    return df
|
app/executor/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
from rich.table import Table
|
|
4
|
+
from rich.prompt import Prompt
|
|
5
|
+
|
|
6
|
+
console = Console()
|
|
7
|
+
|
|
8
|
+
EXIT_WORDS = {"exit", "quit", "/exit", "/quit", "bye", "q", ":q"}
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _ask(message: str) -> str:
    """
    Ask the user for input and return the answer exactly as typed.

    Raises UserExitError when the prompt is interrupted (Ctrl+C / end of
    input) or when the trimmed, lower-cased answer is one of EXIT_WORDS.
    """
    # Deferred import: main imports sheet_selector, so importing it at module
    # level would be circular.
    from app.cli.main import UserExitError

    try:
        answer = Prompt.ask(message)
    except (KeyboardInterrupt, EOFError):
        raise UserExitError()

    wants_exit = answer.strip().lower() in EXIT_WORDS
    if wants_exit:
        raise UserExitError()

    return answer
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_sheet_info(file_path: str) -> dict:
    """
    Describe every sheet of an Excel workbook.

    Returns {sheet_name: {"rows": int, "cols": int, "columns": [str]}}.
    A sheet that cannot be parsed is still listed, with "?" for both counts
    and an empty column list. Errors opening the workbook itself propagate
    to the caller.
    """
    info = {}
    # The context manager closes the workbook's file handle even when a
    # sheet fails to parse.
    with pd.ExcelFile(file_path) as workbook:
        for name in workbook.sheet_names:
            try:
                frame = workbook.parse(name)
            except Exception:
                # Best effort: one unreadable sheet must not hide the others.
                info[name] = {"rows": "?", "cols": "?", "columns": []}
                continue
            info[name] = {
                "rows": len(frame),
                "cols": len(frame.columns),
                "columns": frame.columns.tolist(),
            }
    return info
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _parse_sheet_numbers(raw: str, sheet_names: list):
    """Map '1,3'-style input to sheet names; returns (names, out_of_range_number).

    Raises ValueError when any non-empty token is not an integer.
    """
    numbers = [int(token) for token in (part.strip() for part in raw.split(",")) if token]
    chosen = []
    for number in numbers:
        if not 1 <= number <= len(sheet_names):
            return [], number
        picked = sheet_names[number - 1]
        if picked not in chosen:
            chosen.append(picked)
    return chosen, None


def prompt_sheet_selection(file_path: str) -> list:
    """
    Interactive sheet picker.

    Prints a table of the workbook's sheets, then keeps prompting until the
    user enters valid sheet number(s) or 'all'. Empty input, non-numeric
    input and out-of-range numbers each print a hint and re-prompt.

    Returns the list of selected sheet names (duplicates removed, in the
    order entered), or an empty list if the workbook could not be read.
    Raises UserExitError if the user types an exit command.
    """
    console.print("\n[bold cyan]📋 Excel Sheet Selection[/bold cyan]")

    try:
        sheet_info = get_sheet_info(file_path)
    except Exception as e:
        console.print(f"[red]❌ Could not read sheets: {e}[/red]")
        return []

    sheet_names = list(sheet_info.keys())

    table = Table(title="Available Sheets", border_style="blue", show_lines=True)
    table.add_column("#", style="bold yellow", width=4)
    table.add_column("Sheet", style="bold white")
    table.add_column("Rows", justify="right")
    table.add_column("Columns", justify="right")
    table.add_column("Sample columns", style="dim")

    for i, name in enumerate(sheet_names, 1):
        info = sheet_info[name]
        sample = ", ".join(str(c) for c in info["columns"][:5])
        if len(info["columns"]) > 5:
            sample += f" … (+{len(info['columns']) - 5} more)"
        table.add_row(str(i), name, str(info["rows"]), str(info["cols"]), sample)

    console.print(table)
    console.print(
        "\n[dim]Options:[/dim]\n"
        " [yellow]•[/yellow] Sheet number(s) separated by commas: [bold]1[/bold] or [bold]1,2[/bold]\n"
        " [yellow]•[/yellow] Type [bold]all[/bold] to load all sheets\n"
        " [yellow]•[/yellow] Type [bold]exit[/bold] to quit\n"
    )

    not_a_number_hint = "[red]❌ Please enter numbers, 'all', or 'exit' to quit.[/red]"

    while True:
        raw = _ask("[cyan]👉 Select sheet(s)[/cyan]").strip().lower()

        if raw == "all":
            selected = list(sheet_names)
            break

        try:
            selected, out_of_range = _parse_sheet_numbers(raw, sheet_names)
        except ValueError:
            console.print(not_a_number_hint)
            continue

        if out_of_range is not None:
            console.print(
                f"[red]❌ '{out_of_range}' is out of range (1–{len(sheet_names)})[/red]"
            )
            continue

        if selected:
            break

        # Empty or comma-only input: tell the user what is expected instead
        # of silently asking again.
        console.print(not_a_number_hint)

    if len(selected) == 1:
        console.print(f"\n[green]✅ Loading sheet:[/green] [bold]{selected[0]}[/bold]")
    else:
        console.print(
            f"\n[green]✅ Loading {len(selected)} sheets:[/green] "
            f"[bold]{', '.join(selected)}[/bold]\n"
            "[yellow]ℹ️ Sheets will be merged with a '_sheet' column added "
            "so you can filter per sheet in queries.[/yellow]"
        )

    return selected
|
app/llm/ollama_client.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OllamaClient:
    """
    Thin HTTP client for a text-generation endpoint of an Ollama server.

    generate() never raises: every failure is reported as a string that
    starts with "ERROR:", which callers are expected to check for.
    """

    DEFAULT_URL = "http://localhost:11434/api/generate"

    def __init__(self, model="phi", url=DEFAULT_URL, timeout=30, num_predict=100):
        """
        Args:
            model: model name sent with every request.
            url: full URL of the generate endpoint.
            timeout: request timeout in seconds.
            num_predict: value sent as the "num_predict" generation option.
        """
        self.url = url
        self.model = model
        self.timeout = timeout
        self.num_predict = num_predict

    def generate(self, prompt: str) -> str:
        """
        Send *prompt* in a single non-streaming request.

        Returns the stripped "response" field of the JSON reply (an empty
        string when the field is absent or null), or an "ERROR: ..." string
        on a non-200 status, connection failure, timeout or any other error.
        """
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "num_predict": self.num_predict,
                "temperature": 0,
                "top_p": 0.9,
            },
        }

        try:
            response = requests.post(self.url, json=payload, timeout=self.timeout)

            if response.status_code != 200:
                return f"ERROR: Bad response {response.status_code}"

            data = response.json()

            # A null "response" is treated like a missing one.
            return (data.get("response") or "").strip()

        except requests.exceptions.ConnectionError:
            return "ERROR: Could not connect to Ollama. Is it running?"

        except requests.exceptions.Timeout:
            return "ERROR: Ollama request timed out"

        except Exception as e:
            # Last-resort boundary: callers rely on a string result.
            return f"ERROR: {str(e)}"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
You are a data analyst assistant.
|
|
2
|
+
|
|
3
|
+
Your job is to convert a user query into structured intent.
|
|
4
|
+
|
|
5
|
+
Dataset schema:
|
|
6
|
+
{schema}
|
|
7
|
+
|
|
8
|
+
User query:
|
|
9
|
+
{query}
|
|
10
|
+
|
|
11
|
+
Return ONLY JSON in this format:
|
|
12
|
+
|
|
13
|
+
{
|
|
14
|
+
"metric": "<column_name>",
|
|
15
|
+
"dimension": "<column_name>",
|
|
16
|
+
"operation": "sum | mean | count",
|
|
17
|
+
"query_type": "comparison | aggregation | trend | top_n",
|
|
18
|
+
"limit": number or null
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
Rules:
|
|
22
|
+
- Use only columns from schema
|
|
23
|
+
- "revenue", "sales", "spending" → numeric columns
|
|
24
|
+
- "highest", "most" → comparison
|
|
25
|
+
- "top N" → top_n
|
|
26
|
+
- "average" → mean
|
|
27
|
+
- If unclear → choose best match
|
|
28
|
+
- Do NOT explain anything
|
app/security/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Substrings that mark a query as asking for sensitive data.
BLOCKED_KEYWORDS = ["password", "ssn", "credit card", "api key", "private key"]

# Words/phrases whose presence marks a query as a data question.
ANALYTICAL_KEYWORDS = {
    # intent words
    "highest",
    "lowest",
    "top",
    "bottom",
    "most",
    "least",
    "best",
    "worst",
    "average",
    "avg",
    "mean",
    "total",
    "sum",
    "count",
    "max",
    "min",
    "trend",
    "over time",
    "monthly",
    "daily",
    "weekly",
    "yearly",
    "compare",
    "comparison",
    "distribution",
    "breakdown",
    "ascending",
    "descending",
    "asc",
    "desc",
    "increasing",
    "decreasing",
    "lowest to highest",
    "highest to lowest",
    "sorted",
    "order",
    # question words
    "show",
    "give",
    "find",
    "list",
    "get",
    "what",
    "which",
    "how many",
    "how much",
    "where",
    "who",
    # common data domain words
    "sales",
    "revenue",
    "profit",
    "spend",
    "spending",
    "spent",
    "cost",
    "item",
    "items",
    "product",
    "products",
    "category",
    "location",
    "payment",
    "method",
    "customer",
    "customers",
    "orders",
    "region",
    "city",
    "country",
    "store",
    "date",
    "month",
    "year",
    "by",
    "per",
    "across",
    "between",
    "vs",
    "versus",
}


class InputGuard:
    """
    First-pass filter for user queries.

    run() sets context["error"] when the query is empty, mentions a blocked
    keyword, or contains no analytical/domain word; otherwise the context is
    returned untouched.
    """

    def __init__(self, extra_domain_words=None):
        """
        extra_domain_words: pass your semantic_map column names here so that
        queries referencing column names directly are always accepted.
        Example: InputGuard(extra_domain_words=["corrected_t_spent", "payment_method"])

        Words are compared case-insensitively; blank entries are ignored
        (an empty word would otherwise match every query).
        """
        cleaned = (str(word).strip().lower() for word in (extra_domain_words or []))
        self._extra = {word for word in cleaned if word}
        self._intent_pattern = self._compile_intent_pattern(
            ANALYTICAL_KEYWORDS | self._extra
        )

    @staticmethod
    def _compile_intent_pattern(words):
        """Build a regex matching any of *words* at the start of a word."""
        # Longest first so multi-word phrases are preferred over their prefixes.
        alternatives = sorted((re.escape(w) for w in words), key=len, reverse=True)
        # (?<!\w) rejects matches that begin inside another word, so "by"
        # no longer matches "baby" while "max" still matches "maximum".
        return re.compile(r"(?<!\w)(?:" + "|".join(alternatives) + r")")

    def run(self, context):
        """Validate context["user_query"]; returns the same context dict."""
        raw_query = context.get("user_query")
        query = "" if raw_query is None else str(raw_query).strip()

        # --- Empty ---
        if not query:
            context["error"] = "Please enter a question."
            return context

        query_lower = query.lower()

        # --- Sensitive content ---
        # Deliberately a plain substring test: blocking too much is the safe side.
        if any(word in query_lower for word in BLOCKED_KEYWORDS):
            context["error"] = (
                "⛔ Sensitive query detected. Please ask about your data."
            )
            return context

        # --- Gibberish / no intent ---
        if self._intent_pattern.search(query_lower) is None:
            context["error"] = (
                "❓ I couldn't understand that as a data question.\n\n"
                "Try something like:\n"
                " • 'top 5 items by sales'\n"
                " • 'highest revenue by location'\n"
                " • 'average spend by payment method'\n"
                " • 'total sales trend over time'"
            )
            return context

        return context
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Normalized column names that must never be exposed downstream.
SENSITIVE_COLUMNS = ["password", "ssn", "credit_card"]


class SchemaFilter:
    """Removes sensitive columns from context["schema"]["columns"]."""

    @staticmethod
    def _normalize(name) -> str:
        """Lower-case *name* and join its words with underscores."""
        # "Credit Card", "credit-card" and " credit_card " all become
        # "credit_card" so spelling variants cannot bypass the filter.
        return "_".join(str(name).lower().replace("-", " ").split())

    def run(self, context):
        """
        Drop every column whose normalized name is in SENSITIVE_COLUMNS.

        The schema dict is updated in place and stored back on the context.
        A context without a schema, or a schema without columns, is returned
        unchanged.
        """
        schema = context.get("schema")

        if not schema:
            return context

        columns = schema.get("columns")
        if not columns:
            return context

        schema["columns"] = [
            col
            for col in columns
            if self._normalize(col.get("name", "")) not in SENSITIVE_COLUMNS
        ]
        context["schema"] = schema

        return context
|
app/tests/__init__.py
ADDED
|
File without changes
|
app/tests/llm_test.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from app.agents.llm_intepreter import LLMInterpreter
|
|
2
|
+
|
|
3
|
+
# Manual smoke run: feed one sample question and a three-column schema to
# LLMInterpreter and print the intent it stores on the context.
context = {
    "user_query": "Which location has highest revenue?",
    "schema": {
        "columns": [
            {"name": column_name}
            for column_name in ("location", "payment_method", "total_spent")
        ]
    },
}

agent = LLMInterpreter()
result = agent.run(context)

print(result["intent"])
|
app/tools/__init__.py
ADDED
|
File without changes
|
app/tools/analyzer.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Analyzer:
    """
    Executes the structured intent produced by the interpreter.

    Sheet-aware: if context["intent"]["sheet"] is set (e.g. the user said
    "in sheet Orders") and that sheet exists in context["sheet_dataframes"],
    only that sheet's rows are used for analysis.
    Otherwise the full combined dataframe (context["dataframe"]) is used.
    The source dataframe is never modified; work happens on a copy.

    Supported query types
    ---------------------
    comparison  – groupby dimension, sum metric, sort by value
    top_n       – like comparison, limited to N rows (default 5)
    aggregation – groupby dimension, mean / count / sum (default sum)
    trend       – groupby time dimension, sum, sort by index

    Sorting is descending unless intent["ascending"] is truthy.
    On success the result Series is stored in context["analysis"]; on any
    problem a message is stored in context["error"] instead.
    """

    # Text values that mean "no usable category" once a dimension is cast to str.
    _UNKNOWN_LABELS = ["ERROR", "UNKNOWN", "Unknown", "nan", "None", "<NA>", ""]
    # Internal/system columns that must never be used as dimension.
    _INTERNAL_COLS = {"_sheet"}
    # Name fragments that suggest an identifier rather than a measure.
    _ID_HINTS = {"id", "_id", "key", "code", "num", "no", "number"}
    _DEFAULT_LIMIT = 5

    def run(self, context: dict) -> dict:
        """Run the analysis described by context["intent"]; returns context."""
        intent = context.get("intent") or {}

        metric = intent.get("metric")
        dimension = intent.get("dimension")
        query_type = intent.get("query_type")
        target_sheet = intent.get("sheet")  # set by InterpreterAgent for sheet queries

        # ── Sheet-aware dataframe selection ──────────────────────────────
        sheet_frames = context.get("sheet_dataframes") or {}
        if target_sheet and target_sheet in sheet_frames:
            df = sheet_frames[target_sheet].copy()
        else:
            df = context["dataframe"].copy()

        # ── Guard: columns must exist ─────────────────────────────────────
        all_columns = df.columns.tolist()
        visible_cols = [c for c in all_columns if c != "_sheet"]

        if not metric or metric not in all_columns:
            context["error"] = self._missing_metric_message(
                df, metric, target_sheet, visible_cols
            )
            return context

        if dimension in self._INTERNAL_COLS:
            context["error"] = (
                f"'{dimension}' is an internal system column and cannot be "
                f"used as a dimension.\n\n"
                f" Please rephrase and specify a real dimension column.\n"
                f" Available columns: {visible_cols}"
            )
            return context

        if not dimension or dimension not in all_columns:
            context["error"] = (
                f"Dimension column '{dimension}' not found.\n"
                f" Available columns: {visible_cols}"
            )
            return context

        if not query_type:
            context["error"] = "No query type detected. Please rephrase your question."
            return context

        df[dimension] = self._prepare_dimension(
            df[dimension], intent.get("time_granularity", "day")
        )

        # ── Coerce metric to numeric ──────────────────────────────────────
        df[metric] = pd.to_numeric(df[metric], errors="coerce")
        df = df.dropna(subset=[metric])

        if df.empty:
            context["error"] = f"No numeric data found in '{metric}' after cleaning."
            return context

        # ── Run analysis ──────────────────────────────────────────────────
        try:
            result = self._aggregate(df, intent, metric, dimension, query_type)
        except Exception as e:
            context["error"] = f"Analysis failed: {e}"
            return context

        if result is None:
            context["error"] = f"Unsupported query type: '{query_type}'"
            return context

        context["analysis"] = result
        context["target_sheet"] = target_sheet  # for InsightGenerator label
        return context

    def _missing_metric_message(self, df, metric, target_sheet, visible_cols) -> str:
        """Build the error text for a metric that is absent from *df*."""
        numeric_in_sheet = df.select_dtypes(include="number").columns.tolist()
        # str() keeps non-string column labels from crashing the hint check.
        real_numeric = [
            c
            for c in numeric_in_sheet
            if not any(h in str(c).lower() for h in self._ID_HINTS)
        ]

        if target_sheet and not real_numeric:
            return (
                f"The '{target_sheet}' sheet has no numeric columns to measure.\n"
                f" Columns in this sheet: {visible_cols}\n\n"
                f"This sheet is likely a lookup/reference table.\n"
                f"Try querying a sheet that has numeric data, like Orders."
            )
        if target_sheet and real_numeric:
            return (
                f"'{metric}' is not available in the '{target_sheet}' sheet.\n"
                f" Available numeric columns here: {real_numeric}\n"
                f" Try: 'top 5 by {real_numeric[0]} in {target_sheet}'"
            )
        return (
            f"Metric column '{metric}' not found.\n"
            f" Available columns: {visible_cols}"
        )

    def _prepare_dimension(self, values: pd.Series, granularity) -> pd.Series:
        """Turn the dimension column into clean string group labels."""
        # Datetime columns are bucketed by the requested period so that
        # "which month" groups by month label, not individual dates.
        if pd.api.types.is_datetime64_any_dtype(values):
            if granularity == "year":
                return values.dt.to_period("Y").astype(str)
            if granularity == "month":
                return values.dt.to_period("M").astype(str)
            if granularity == "week":
                return values.dt.to_period("W").astype(str)
            return values.dt.date.astype(str)

        # Categorical: trim whitespace and fold placeholder values together.
        return values.astype(str).str.strip().replace(self._UNKNOWN_LABELS, "Unknown")

    def _aggregate(self, df, intent, metric, dimension, query_type):
        """Return the grouped result Series, or None for an unknown query type."""
        if query_type not in ("comparison", "top_n", "aggregation", "trend"):
            return None

        ascending = self._coerce_ascending(intent.get("ascending", False))
        grouped = df.groupby(dimension)[metric]

        if query_type == "trend":
            return grouped.sum().sort_index()

        if query_type == "aggregation":
            op = intent.get("operation", "sum")
            if op == "mean":
                values = grouped.mean()
            elif op == "count":
                values = grouped.count()
            else:
                values = grouped.sum()
            # Always sort by value so display and insight are consistent
            return values.sort_values(ascending=ascending)

        ranked = grouped.sum().sort_values(ascending=ascending)
        if query_type == "top_n":
            return ranked.head(self._coerce_limit(intent.get("limit")))
        return ranked

    @classmethod
    def _coerce_limit(cls, raw) -> int:
        """Return *raw* as a positive int, falling back to the default of 5."""
        # Intents may carry the limit as text or float ("5", 5.0).
        try:
            limit = int(raw)
        except (TypeError, ValueError):
            return cls._DEFAULT_LIMIT
        return limit if limit > 0 else cls._DEFAULT_LIMIT

    @staticmethod
    def _coerce_ascending(raw) -> bool:
        """Interpret the intent's sort flag; None and unknown text mean descending."""
        if isinstance(raw, str):
            return raw.strip().lower() in {"true", "1", "yes", "asc", "ascending"}
        return bool(raw)
|