querymind-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. app/agents/InterpreterAgent.py +473 -0
  2. app/agents/__init__.py +0 -0
  3. app/agents/insights_generator.py +151 -0
  4. app/agents/intent_corrector.py +59 -0
  5. app/agents/llm_intepreter.py +132 -0
  6. app/agents/narrator.py +27 -0
  7. app/agents/planner.py +77 -0
  8. app/cli/__init__.py +0 -0
  9. app/cli/main.py +346 -0
  10. app/cli/tui_app.py +98 -0
  11. app/cli/ui.py +21 -0
  12. app/core/__init__.py +0 -0
  13. app/core/context.py +10 -0
  14. app/core/logger.py +2 -0
  15. app/core/pipeline.py +379 -0
  16. app/data/__init__.py +0 -0
  17. app/data/connectors/csv_connector.py +99 -0
  18. app/data/connectors/excel_connector.py +68 -0
  19. app/data/connectors/no_sql_db_connector.py +0 -0
  20. app/data/connectors/sql_db_connector.py +0 -0
  21. app/data/schema_engine.py +18 -0
  22. app/data/type_caster.py +128 -0
  23. app/executor/__init__.py +0 -0
  24. app/executor/db_executor.py +0 -0
  25. app/executor/sheet_selector.py +120 -0
  26. app/llm/ollama_client.py +47 -0
  27. app/prompts/interpreter_prompt.txt +28 -0
  28. app/security/__init__.py +0 -0
  29. app/security/input_guard.py +133 -0
  30. app/security/schema_filter.py +20 -0
  31. app/tests/__init__.py +0 -0
  32. app/tests/llm_test.py +18 -0
  33. app/tools/__init__.py +0 -0
  34. app/tools/analyzer.py +157 -0
  35. app/tools/join_resolver.py +159 -0
  36. app/tools/sql_writer.py +37 -0
  37. app/tools/validator.py +0 -0
  38. querymind_cli-0.1.0.dist-info/METADATA +139 -0
  39. querymind_cli-0.1.0.dist-info/RECORD +43 -0
  40. querymind_cli-0.1.0.dist-info/WHEEL +5 -0
  41. querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
  42. querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  43. querymind_cli-0.1.0.dist-info/top_level.txt +1 -0
app/cli/ui.py ADDED
@@ -0,0 +1,21 @@
1
+ from rich.console import Console
2
+ from rich.panel import Panel
3
+
4
+ console = Console()
5
+
6
+
7
def show_header():
    """Print the QueryMind banner (title + tagline) as a fitted panel."""
    title_line = "[bold #2dd9fe] 🧠 QueryMind CLI [/bold #2dd9fe]\n"
    tagline = "[#74ee15 ] Your AI Data Analyst [/#74ee15]"

    # Panel.fit sizes the box to the text instead of the terminal width.
    banner = Panel.fit(title_line + tagline, border_style="blue")
    console.print(banner)
15
+
16
+
17
def show_message(role, text):
    """
    Print one chat line to the shared console, styled by speaker.

    role – "user" renders bold blue with a ">>" prefix; any other value
           renders bold green with a ">>💡" prefix
    text – message body
    """
    if role == "user":
        # User input is free text: escape it so a stray "[...]" is printed
        # literally instead of being parsed as Rich markup (an unmatched
        # "[/tag]" would otherwise raise MarkupError).
        from rich.markup import escape

        console.print(f"[bold blue]>> {escape(str(text))}[/bold blue]")
    else:
        # NOTE(review): left unescaped on purpose — callers may pass text that
        # already contains Rich markup; confirm before escaping this branch.
        console.print(f"[bold green]>>💡 {text}[/bold green]")
app/core/__init__.py ADDED
File without changes
app/core/context.py ADDED
@@ -0,0 +1,10 @@
1
class Context(dict):
    """Dict holding one query's state, pre-seeded with empty result slots."""

    def __init__(self, user_query):
        # Keyword order fixes the key order; "logs" is a fresh list per instance.
        super().__init__(
            user_query=user_query,
            intent=None,
            analysis=None,
            answer=None,
            error=None,
            logs=[],
        )
app/core/logger.py ADDED
@@ -0,0 +1,2 @@
1
def log_step(context, step, data):
    """
    Append one log record to context["logs"].

    context – mutable mapping holding the query state
    step    – step label, stored under the "steps" key
    data    – payload, stored under the "data" key

    The "logs" list is created on first use, so a plain dict works as well
    as a context that was seeded with one.
    """
    context.setdefault("logs", []).append({"steps": step, "data": data})
app/core/pipeline.py ADDED
@@ -0,0 +1,379 @@
1
+ from app.agents.InterpreterAgent import InterpreterAgent
2
+ from app.agents.llm_intepreter import LLMInterpreter
3
+ from app.tools.analyzer import Analyzer
4
+ from app.security.input_guard import InputGuard
5
+ from app.data.connectors.csv_connector import CSVConnector
6
+ from app.data.connectors.excel_connector import ExcelConnector
7
+ from app.security.schema_filter import SchemaFilter
8
+ from app.data.schema_engine import SchemaEngine
9
+ from app.agents.insights_generator import InsightGenerator
10
+ from app.tools.join_resolver import JoinResolver
11
+
12
+
13
class QueryMindPipeline:
    """
    Orchestrates the full query → insight pipeline.

    Accepts either a CSVConnector or ExcelConnector — the rest of the
    pipeline is connector-agnostic.

    Step sequence
    -------------
    1.   InputGuard       – blocks junk / sensitive input
    2.   InterpreterAgent – fast rule-based intent extraction
    3.   LLMInterpreter   – runs only when confidence < 0.8; the query is
                            rejected with a rephrasing hint when Ollama is
                            unavailable or the LLM yields no intent
    3.5  Guards           – unknown-column check, missing time column
    3.7  JoinResolver     – cross-sheet join resolution
    4.   Analyzer         – pandas operations; sheet-aware for Excel
    5.   InsightGenerator – formats raw Series → readable answer
    """

    # Rule-based intents below this confidence are handed to the LLM.
    _LLM_CONFIDENCE_THRESHOLD = 0.8

    # Example queries shared by both "couldn't understand" messages.
    _EXAMPLE_QUERIES = (
        " • 'top 5 items by sales'\n"
        " • 'highest revenue by location'\n"
        " • 'average spend by payment method'\n"
        " • 'total sales trend over time'"
    )

    # Query words that never count as a reference to a (missing) column.
    # Built once at class level; per-query additions go into a copy.
    _STOP_WORDS = frozenset(
        {
            "which", "what", "who", "where", "when", "how", "the", "was",
            "were", "had", "has", "have", "did", "does", "most", "least",
            "max", "min", "top", "highest", "lowest", "total", "average",
            "used", "give", "show", "list", "find", "get", "and", "for",
            "with", "from", "that", "this", "are", "all", "per", "across",
            "gave", "its", "their", "use", "been", "much", "many", "more",
            "less", "than", "into", "over", "each", "some", "any", "our",
            "not", "but", "can", "could", "would", "using", "like",
            "sales", "revenue", "profit", "spend", "spending", "spent",
            "cost", "amount", "value", "number", "count", "sum", "avg",
            "mean", "sheet", "data", "file", "table", "column", "field",
            "ascending", "descending", "asc", "desc", "increasing",
            "decreasing", "sort", "sorted", "order", "ordering",
            # Common filler words that are not column names
            "specific", "wise", "based", "overall", "respective",
            "related", "breakdown", "detail", "particular", "certain",
            "various", "different",
        }
    )

    def __init__(self, connector, semantic_map: dict):
        """
        connector    – a CSVConnector or ExcelConnector instance
        semantic_map – {"metric": col, "dimension": col, "time": col|None}

        Raises RuntimeError when the connector reports an error while
        loading the data.
        """
        self.semantic_map = semantic_map

        # Infrastructure
        self.schema_filter = SchemaFilter()
        self.schema_engine = SchemaEngine()

        extra_words = [v for v in semantic_map.values() if v]
        self.input_guard = InputGuard(extra_domain_words=extra_words)

        # Agents
        self.interpreter = InterpreterAgent()
        self.llm_interpreter = LLMInterpreter()
        self.insight_generator = InsightGenerator()
        self.analyzer = Analyzer()
        self.join_resolver = JoinResolver()

        # Check Ollama availability at startup
        self.llm_available = self._check_ollama()

        # Load + cache base context once at startup
        self._base_context = {}
        self._base_context = connector.run(self._base_context)
        if self._base_context.get("error"):
            raise RuntimeError(f"Failed to load data: {self._base_context['error']}")
        self._base_context = self.schema_filter.run(self._base_context)
        self._base_context = self.schema_engine.run(self._base_context)

    # ------------------------------------------------------------------
    def _check_ollama(self) -> bool:
        """
        Ping Ollama at startup. Returns True if reachable, False otherwise.
        Prints a clear warning so the user knows LLM fallback is disabled.
        """
        try:
            import requests

            resp = requests.get("http://localhost:11434", timeout=3)
            if resp.status_code == 200:
                print("✅ Ollama detected — LLM fallback enabled")
                return True
        except Exception:
            # Deliberate best-effort probe: any failure (requests missing,
            # connection refused, timeout) just means "no LLM", which the
            # warning below reports.
            pass
        print(
            "⚠️ Ollama not detected on localhost:11434\n"
            " LLM fallback disabled — rule-based interpreter only.\n"
            " To enable: install Ollama from https://ollama.ai and run: ollama pull phi"
        )
        return False

    # ------------------------------------------------------------------
    def _check_missing_column(self, context: dict) -> dict:
        """
        Detects when the user's query references a column that doesn't exist
        in any loaded sheet, and the interpreter silently fell back to the
        semantic default dimension.

        Sets context["error"] with a helpful message if detected; otherwise
        returns the context untouched.
        """
        import re

        # "or" (rather than a .get default) also covers keys that are
        # present but hold None.
        query = (context.get("user_query") or "").lower()
        intent = context.get("intent") or {}
        schema = context.get("schema") or {}
        semantic = context.get("semantic_map") or {}

        # Only check when interpreter fell back to semantic default
        # (means it couldn't find an explicit column match in the query)
        if intent.get("dimension", "") != semantic.get("dimension", ""):
            return context

        columns = [col["name"] for col in schema.get("columns", [])]
        col_set = set(columns)
        col_words = {
            part for col in columns for part in col.split("_") if len(part) > 2
        }

        # Add sheet names AND every individual word in each sheet name
        # so "List of Orders" doesn't cause "orders" to be flagged as missing
        stop_words = set(self._STOP_WORDS)
        for sheet in context.get("excel_sheets") or []:
            lowered = sheet.lower()
            stop_words.add(lowered)
            stop_words.update(lowered.split())

        words = re.findall(r"[a-zA-Z]+", query)

        # Unigrams: words unknown to both stop list and schema
        unknowns = [
            w
            for w in words
            if w not in stop_words
            and w not in col_words
            and w not in col_set
            and len(w) > 3
        ]

        # Bigrams: adjacent unknown-word pairs as potential column names
        bigrams = []
        for a, b in zip(words, words[1:]):
            pair = f"{a}_{b}"
            if (
                pair not in col_set
                and a not in stop_words
                and b not in stop_words
                and a not in col_words
                and b not in col_words
                and len(a) > 2
                and len(b) > 2
            ):
                bigrams.append(pair)

        candidates = bigrams + unknowns
        if not candidates:
            return context

        # Most likely missing column = longest candidate. max() keeps the
        # first of equally long candidates, so the message is the same on
        # every run (sorting a set left ties to hash order).
        most_likely = max(candidates, key=len)

        # Suggest the closest real column name using character overlap
        def similarity(a, b):
            a_set = set(a.replace("_", ""))
            b_set = set(b.replace("_", ""))
            return len(a_set & b_set) / max(len(a_set | b_set), 1)

        visible = [c for c in columns if c != "_sheet"]
        suggestions = sorted(
            visible,
            key=lambda c: similarity(most_likely, c),
            reverse=True,
        )[:3]

        label = most_likely.replace("_", " ")
        message = (
            f"❓ Column '{label}' doesn't exist in your data.\n\n"
            f" Available columns: {visible}\n\n"
            f" Closest matches: {suggestions}\n"
        )
        # The example needs a real column; skip it when the schema has none.
        if suggestions:
            metric = semantic.get("metric", "value")
            message += (
                f" Try rephrasing — e.g. 'which {suggestions[0]} had the most {metric}?'"
            )

        context["error"] = message
        return context

    # ------------------------------------------------------------------
    def run(self, context: dict) -> dict:
        """
        Run one query through every pipeline step.

        context – per-query dict; must carry "user_query"

        Returns the same dict. On success "answer" is set; when a step
        fails, "error" is set and the remaining steps are skipped.
        """
        base = self._base_context

        # Inject shared state into every query context
        context["dataframe"] = base.get("dataframe")
        context["schema"] = base.get("schema")
        context["schema_description"] = base.get("schema_description")
        context["semantic_map"] = self.semantic_map

        # Carry Excel-specific metadata so Analyzer / InsightGenerator can use it
        context["sheet_dataframes"] = base.get("sheet_dataframes", {})
        context["excel_sheets"] = base.get("excel_sheets", [])
        context["excel_mode"] = base.get("excel_mode", None)

        # STEP 1 – Input guard
        context = self.input_guard.run(context)
        if context.get("error"):
            return context

        # STEP 2 – Rule-based interpreter
        context = self.interpreter.run(context)
        if context.get("error"):
            return context

        confidence = context.get("intent_confidence") or 0

        # STEP 3 – LLM fallback (only when Ollama is available and confidence is low)
        if confidence < self._LLM_CONFIDENCE_THRESHOLD:
            if not self.llm_available:
                # Ollama is down — reject low-confidence queries cleanly
                context["error"] = (
                    "❓ I couldn't understand that query, and the LLM fallback "
                    "is unavailable (Ollama not running).\n\n"
                    "Try rephrasing with clearer keywords:\n"
                    + self._EXAMPLE_QUERIES
                )
                return context

            # The LLM works on a copy so a failed attempt cannot corrupt
            # the caller's context.
            llm_context = self.llm_interpreter.run(dict(context))
            # A reply without an intent is as unusable as an explicit error.
            if llm_context.get("error") or llm_context.get("intent") is None:
                context["error"] = (
                    "❓ I couldn't understand that query.\n\n"
                    "Try something like:\n"
                    + self._EXAMPLE_QUERIES
                )
                return context
            context["intent"] = llm_context["intent"]
            context["llm_used"] = True

        # STEP 3.5a – Guard: user asked about a column that doesn't exist
        context = self._check_missing_column(context)
        if context.get("error"):
            return context

        # STEP 3.5b – Guard: trend query but no time column configured
        if (context.get("intent") or {}).get("no_time_column"):
            context["error"] = (
                "⏱️ This query needs a time column, but none was configured.\n\n"
                "Re-run QueryMind and set a time column at the setup prompt, "
                "or rephrase your question to use a different dimension."
            )
            return context

        # STEP 3.7 – Cross-sheet join resolution
        # Runs only when dimension column lives in a different sheet
        # than the metric column (e.g. "which manager had max sales?")
        context = self.join_resolver.run(context)
        if context.get("error"):
            return context

        # STEP 4 – Analyze
        context = self.analyzer.run(context)
        if context.get("error"):
            return context

        # STEP 5 – Generate insight
        context = self.insight_generator.run(context)

        if not context.get("answer"):
            raw = context.get("analysis")
            if raw is None:
                context["answer"] = "⚠️ Could not generate an answer for that query."
            elif hasattr(raw, "to_string"):
                context["answer"] = raw.to_string()
            else:
                # Scalars and other non-pandas results have no to_string().
                context["answer"] = str(raw)

        return context
app/data/__init__.py ADDED
File without changes
@@ -0,0 +1,99 @@
1
+ import pandas as pd
2
+ import chardet
3
+ from app.data.type_caster import smart_cast_df
4
+
5
+
6
def _detect_encoding(file_path: str) -> str:
    """
    Detect file encoding from the first 4096 bytes.

    Checks for a BOM first (catches Excel-exported UTF-8 files), then
    falls back to chardet for other encodings (latin-1, cp1252, etc.).
    Returns a codec name; "utf-8" when chardet has no guess.
    """
    with open(file_path, "rb") as f:
        raw = f.read(4096)

    # UTF-8 BOM — most common cause of \ufeff in column names
    if raw.startswith(b"\xef\xbb\xbf"):
        return "utf-8-sig"

    # UTF-32 BOMs — tested before UTF-16 because the UTF-32-LE BOM
    # (FF FE 00 00) begins with the UTF-16-LE BOM (FF FE).
    if raw.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")):
        return "utf-32"

    # UTF-16 BOMs
    if raw.startswith((b"\xff\xfe", b"\xfe\xff")):
        return "utf-16"

    detected = chardet.detect(raw)
    encoding = detected.get("encoding") or "utf-8"

    # "ascii" only describes the sampled prefix. UTF-8 decodes every ASCII
    # file identically and also survives non-ASCII bytes further down.
    if encoding.lower() == "ascii":
        return "utf-8"
    return encoding
25
+
26
+
27
def _detect_delimiter(file_path: str, encoding: str) -> str:
    """
    Pick the separator that occurs most often in the file's first line.

    Candidates are comma, semicolon, tab and pipe; on a tie the earlier
    one in that order wins. Returns "," when the file cannot be read or
    the first line contains none of them.
    """
    try:
        with open(file_path, "r", encoding=encoding, errors="replace") as handle:
            header = handle.readline()
    except Exception:
        return ","

    winner, hits = ",", 0
    for sep in (",", ";", "\t", "|"):
        occurrences = header.count(sep)
        # Strict ">" keeps the earlier candidate when counts are equal.
        if occurrences > hits:
            winner, hits = sep, occurrences
    return winner
44
+
45
+
46
class CSVConnector:
    """Loads one delimited text file into the pipeline context."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def run(self, context: dict) -> dict:
        """
        Read the file and store the result in context.

        On success sets context["dataframe"] and context["schema"]; on any
        failure sets context["error"] instead. Always returns context.
        """
        try:
            path = self.file_path
            enc = _detect_encoding(path)
            sep = _detect_delimiter(path, enc)

            print(f"📄 CSV encoding={enc} delimiter={sep!r}")

            try:
                frame = pd.read_csv(
                    path,
                    encoding=enc,
                    sep=sep,
                    on_bad_lines="warn",  # skip malformed rows, don't crash
                )
            except pd.errors.EmptyDataError:
                context["error"] = (
                    f"'{path}' is completely empty. "
                    f"Please provide a file with headers and at least one row of data."
                )
                return context

            # Headers parsed fine but there is not a single data row.
            if frame.empty:
                context["error"] = (
                    f"'{path}' contains only headers and no data rows. "
                    f"Please provide a file with at least one row of data."
                )
                return context

            # Column names become lowercase snake_case.
            frame.columns = [
                name.lower().strip().replace(" ", "_") for name in frame.columns
            ]

            # Shared smart caster: numeric + packed-date detection.
            frame = smart_cast_df(frame)

            column_specs = []
            for name in frame.columns:
                column_specs.append({"name": name, "type": str(frame[name].dtype)})

            context["dataframe"] = frame
            context["schema"] = {"columns": column_specs}

            print("✅ Columns detected:", frame.columns.tolist())
            print("📊 Data types:\n", frame.dtypes.to_string())

            return context

        except Exception as e:
            context["error"] = f"Failed to load CSV: {e}"
            return context
@@ -0,0 +1,68 @@
1
+ import pandas as pd
2
+ from app.data.type_caster import smart_cast_df
3
+
4
+
5
def _normalize_col(col: str) -> str:
    """
    Return a column header as lowercase snake_case.

    The header is converted with str() first: Excel header cells may hold
    numbers or dates, which have no .lower().
    """
    return str(col).lower().strip().replace(" ", "_")
7
+
8
+
9
class ExcelConnector:
    """Loads the selected sheets of an Excel workbook into the pipeline context."""

    def __init__(self, file_path: str, selected_sheets: list):
        """
        file_path       – path of the workbook
        selected_sheets – names of the sheets to load
        """
        self.file_path = file_path
        self.selected_sheets = selected_sheets

    def run(self, context: dict) -> dict:
        """
        Parse every selected sheet and store the result in context.

        On success sets "dataframe" (all sheets stacked, with a "_sheet"
        column naming the origin), "sheet_dataframes", "excel_sheets",
        "excel_mode" ("single" or "multi") and "schema". On failure sets
        "error" instead. Always returns context.
        """
        # With nothing selected, pd.concat below would raise on an empty list.
        if not self.selected_sheets:
            context["error"] = "No sheets selected — choose at least one sheet to load."
            return context

        try:
            xl = pd.ExcelFile(self.file_path)
        except Exception as e:
            context["error"] = f"Cannot open Excel file: {e}"
            return context

        # ExcelFile keeps the workbook handle open; the with-block closes it
        # on every exit path, including the early error returns.
        with xl:
            available = xl.sheet_names
            invalid = [s for s in self.selected_sheets if s not in available]
            if invalid:
                context["error"] = f"Sheet(s) not found: {invalid}. Available: {available}"
                return context

            sheet_dfs = {}
            for sheet in self.selected_sheets:
                try:
                    df = xl.parse(sheet)
                    df.columns = [_normalize_col(c) for c in df.columns]
                    df = smart_cast_df(df)  # ← shared smart caster
                    sheet_dfs[sheet] = df
                except Exception as e:
                    context["error"] = f"Failed to parse sheet '{sheet}': {e}"
                    return context

        mode = "single" if len(self.selected_sheets) == 1 else "multi"

        if mode == "single":
            combined = sheet_dfs[self.selected_sheets[0]].copy()
            combined["_sheet"] = self.selected_sheets[0]
        else:
            frames = []
            for sheet, df in sheet_dfs.items():
                df = df.copy()
                df["_sheet"] = sheet
                frames.append(df)
            combined = pd.concat(frames, ignore_index=True, sort=False)
            # pd.concat upcasts int64 → float64 when a column is missing
            # in some sheets (NaN rows force the upcast). Re-run downcast.
            combined = smart_cast_df(combined)

        print(f"✅ Excel loaded ({mode}): {self.selected_sheets}")
        print(f" Shape: {combined.shape} Cols: {combined.columns.tolist()}")
        print(f" Dtypes: {combined.dtypes.to_dict()}")

        context["dataframe"] = combined
        context["sheet_dataframes"] = sheet_dfs
        context["excel_sheets"] = self.selected_sheets
        context["excel_mode"] = mode
        context["schema"] = {
            "columns": [
                {"name": col, "type": str(combined[col].dtype)}
                for col in combined.columns
            ]
        }
        return context
File without changes
File without changes
@@ -0,0 +1,18 @@
1
class SchemaEngine:
    """Turns context["schema"] into a plain-text table description."""

    def run(self, context):
        """
        Set context["schema_description"] from context["schema"].

        The description lists one "- name (type)" line per column. When the
        schema is missing or empty the context is dumped to stdout and
        returned unchanged.
        """
        # .get: a context without a schema takes the branch below instead
        # of raising KeyError.
        schema = context.get("schema")
        print("Schema: ", schema)

        if not schema:
            print("\n -------------- CONTEXT------------ \n", context)

            return context

        column_lines = [
            f"- {col['name']} ({col['type']})\n" for col in schema.get("columns", [])
        ]
        context["schema_description"] = "Table: data\n\nColumns:\n" + "".join(
            column_lines
        )

        return context