querymind-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. app/agents/InterpreterAgent.py +473 -0
  2. app/agents/__init__.py +0 -0
  3. app/agents/insights_generator.py +151 -0
  4. app/agents/intent_corrector.py +59 -0
  5. app/agents/llm_intepreter.py +132 -0
  6. app/agents/narrator.py +27 -0
  7. app/agents/planner.py +77 -0
  8. app/cli/__init__.py +0 -0
  9. app/cli/main.py +346 -0
  10. app/cli/tui_app.py +98 -0
  11. app/cli/ui.py +21 -0
  12. app/core/__init__.py +0 -0
  13. app/core/context.py +10 -0
  14. app/core/logger.py +2 -0
  15. app/core/pipeline.py +379 -0
  16. app/data/__init__.py +0 -0
  17. app/data/connectors/csv_connector.py +99 -0
  18. app/data/connectors/excel_connector.py +68 -0
  19. app/data/connectors/no_sql_db_connector.py +0 -0
  20. app/data/connectors/sql_db_connector.py +0 -0
  21. app/data/schema_engine.py +18 -0
  22. app/data/type_caster.py +128 -0
  23. app/executor/__init__.py +0 -0
  24. app/executor/db_executor.py +0 -0
  25. app/executor/sheet_selector.py +120 -0
  26. app/llm/ollama_client.py +47 -0
  27. app/prompts/interpreter_prompt.txt +28 -0
  28. app/security/__init__.py +0 -0
  29. app/security/input_guard.py +133 -0
  30. app/security/schema_filter.py +20 -0
  31. app/tests/__init__.py +0 -0
  32. app/tests/llm_test.py +18 -0
  33. app/tools/__init__.py +0 -0
  34. app/tools/analyzer.py +157 -0
  35. app/tools/join_resolver.py +159 -0
  36. app/tools/sql_writer.py +37 -0
  37. app/tools/validator.py +0 -0
  38. querymind_cli-0.1.0.dist-info/METADATA +139 -0
  39. querymind_cli-0.1.0.dist-info/RECORD +43 -0
  40. querymind_cli-0.1.0.dist-info/WHEEL +5 -0
  41. querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
  42. querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
  43. querymind_cli-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,473 @@
1
+ import re
2
+
3
+
4
class InterpreterAgent:
    """
    Rule-based intent interpreter.

    Builds a structured intent dict from the user query using keyword matching
    and the semantic_map supplied at setup time.

    Keywords and phrases are matched at the *start of a word* (so "months"
    and "maximum" still match "month" / "max"), never in the middle of one
    (so "stop" is not "top", "admin" is not "min", "today" is not "day").
    Column names are matched case-insensitively against the lower-cased query.

    Confidence rules
    ----------------
    - 0.9 → matched a strong analytical keyword, a loaded sheet name or a
            column name; skip LLM
    - 0.2 → nothing recognisable; defer to LLM
    """

    STRONG_KEYWORDS = {
        "highest",
        "lowest",
        "top",
        "bottom",
        "average",
        "avg",
        "mean",
        "trend",
        "over time",
        "monthly",
        "daily",
        "weekly",
        "yearly",
        "total",
        "sum",
        "most",
        "least",
        "max",
        "min",
        "distribution",
        "breakdown",
        "compare",
        "which",
        "what",
        "when",
        # Display / grouping words — common in natural phrasing
        "show",
        "show me",
        "give me",
        "list",
        "get",
        "find",
        "by",
        "per",
        "across",
        "grouped",
        "group",
        # Sort order
        "ascending",
        "descending",
        "asc",
        "desc",
        "increasing",
        "decreasing",
        "sorted",
        "order",
    }

    # Order matters: first match wins.
    QUERY_TYPE_MAP = [
        ("top", "top_n", "sum"),
        ("bottom", "top_n", "sum"),
        ("highest", "comparison", "sum"),
        ("lowest", "comparison", "sum"),
        ("most", "comparison", "sum"),
        ("least", "comparison", "sum"),
        ("average", "aggregation", "mean"),
        ("avg", "aggregation", "mean"),
        ("mean", "aggregation", "mean"),
        ("trend", "trend", "sum"),
        ("over time", "trend", "sum"),
        ("monthly", "trend", "sum"),
        ("daily", "trend", "sum"),
        ("total", "aggregation", "sum"),
        ("sum", "aggregation", "sum"),
        ("distribution", "aggregation", "sum"),
        ("breakdown", "aggregation", "sum"),
        ("compare", "comparison", "sum"),
        ("max", "comparison", "sum"),
        ("min", "comparison", "sum"),
        ("weekly", "trend", "sum"),
        ("yearly", "trend", "sum"),
        ("which", "comparison", "sum"),
        ("when", "trend", "sum"),
    ]

    # Natural-language synonyms that mean "use the configured metric".
    # When a user says "top 5 items by SALES", "sales" isn't a column —
    # it's a synonym for whatever metric they configured.
    # NOTE: run() does not need to consult this set: when no column is named
    # explicitly the intent already carries the configured metric.
    METRIC_SYNONYMS = {
        "sales",
        "revenue",
        "profit",
        "spend",
        "spending",
        "spent",
        "cost",
        "costs",
        "amount",
        "amounts",
        "value",
        "values",
        "earning",
        "earnings",
        "income",
        "price",
        "prices",
    }

    # Natural-language synonyms that map to known dimension keywords.
    DIMENSION_KEYWORDS = {
        "location": "location",
        "city": "location",
        "region": "location",
        "payment": "payment_method",
        "payment method": "payment_method",
        "item": "item",
        "items": "item",
        "product": "item",
        "products": "item",
        "category": "item",
    }

    # Time-related words that should route dimension → time column
    TIME_WORDS = {
        "month",
        "monthly",
        "year",
        "yearly",
        "annual",
        "week",
        "weekly",
        "daily",
        "day",
        "date",
        "over time",
        "when",
    }

    # Internal/system columns that must never appear in intent
    _INTERNAL_COLS = frozenset({"_sheet"})

    # Column-name fragments suggesting the column holds a numeric measure
    _NUMERIC_HINTS = (
        "amount",
        "price",
        "spent",
        "revenue",
        "sales",
        "cost",
        "total",
        "sum",
        "qty",
        "quantity",
        "profit",
    )

    # Explicit sort-order phrases
    _ASC_PHRASES = (
        "ascending order",
        "ascending",
        "asc order",
        "asc",
        "lowest to highest",
        "low to high",
        "smallest to largest",
        "increasing order",
        "increasing",
        "worst to best",
        "least to most",
    )
    _DESC_PHRASES = (
        "descending order",
        "descending",
        "desc order",
        "desc",
        "highest to lowest",
        "high to low",
        "largest to smallest",
        "decreasing order",
        "decreasing",
        "best to worst",
        "most to least",
    )

    # Implicit: min/low/worst words suggest ascending (lowest first)
    _MIN_WORDS = (
        "minimum",
        "min",
        "less",
        "least",
        "lowest",
        "worst",
        "bottom",
        "fewest",
        "smallest",
    )

    @staticmethod
    def _mentions(query, term):
        """Return True when *term* occurs in *query* starting at a word boundary.

        Only the left edge is anchored, so inflected forms ("months",
        "grouping", "maximum") still count while embedded hits ("stop" for
        "top", "today" for "day") do not.
        """
        return re.search(r"(?<!\w)" + re.escape(term), query) is not None

    @staticmethod
    def _column_mentioned(query, col):
        """Return True when column *col* (raw or with underscores as spaces)
        appears in the lower-cased *query*. Blank names never match."""
        name = col.lower()
        readable = name.replace("_", " ").strip()
        return bool(readable) and (name in query or readable in query)

    def run(self, context):
        """Derive ``context["intent"]`` and ``context["intent_confidence"]``.

        Reads ``user_query``, ``schema["columns"]`` (list of dicts with a
        ``name`` key) and ``semantic_map`` from *context*; optionally
        ``excel_sheets`` and ``sheet_dataframes``.

        On an empty / digits-only query, or when the query names a sheet that
        is not loaded, ``context["error"]`` is set and no intent is written.
        Always returns the same *context* object.
        """
        query = context["user_query"].lower().strip()
        columns = [col["name"] for col in context["schema"]["columns"]]
        semantic_map = context["semantic_map"]
        internal = self._INTERNAL_COLS
        mentions = self._mentions

        # Validate semantic_map columns exist in the dataframe and are not internal
        default_metric = semantic_map.get("metric")
        if default_metric not in columns or default_metric in internal:
            default_metric = None

        default_dimension = semantic_map.get("dimension")
        if default_dimension not in columns or default_dimension in internal:
            # Pick the first non-internal, non-id-looking column as fallback
            default_dimension = next(
                (
                    c
                    for c in columns
                    if c not in internal
                    and not any(
                        h in c.lower() for h in ("id", "row_id", "index", "key")
                    )
                ),
                None,
            )

        # Guard: empty / numeric-only
        if not query or query.isdigit():
            context["error"] = "Please enter a meaningful question."
            return context

        # ── Base intent ──────────────────────────────────────────────────
        intent = {
            "metric": default_metric,
            "dimension": default_dimension,
            "query_type": "aggregation",
            "operation": "sum",
            "limit": None,
        }

        # Columns the user named, in schema order (internal/blank excluded).
        mentioned_cols = [
            c
            for c in columns
            if c not in internal and self._column_mentioned(query, c)
        ]

        # ── Metric resolution ────────────────────────────────────────────
        # First mentioned column whose name looks numeric becomes the metric;
        # otherwise the configured default metric stays in place.
        for col in mentioned_cols:
            lowered = col.lower()
            if any(h in lowered for h in self._NUMERIC_HINTS):
                intent["metric"] = col
                break

        # ── Dimension resolution ─────────────────────────────────────────
        # explicit_dim records that the user actually named a dimension, so
        # the sheet-aware fallback further down must not overwrite it.
        explicit_dim = False

        # 1. Keyword → dimension map (explicit, fast)
        for keyword, col_name in self.DIMENSION_KEYWORDS.items():
            if col_name in columns and mentions(query, keyword):
                intent["dimension"] = col_name
                explicit_dim = True
                break

        # 2. Exact column name match (catches user's own column names)
        if not explicit_dim:
            for col in mentioned_cols:
                if col != intent.get("metric"):  # don't use metric as dimension
                    intent["dimension"] = col
                    explicit_dim = True
                    break

        # ── Query-type detection (first entry of QUERY_TYPE_MAP wins) ────
        for keyword, q_type, op in self.QUERY_TYPE_MAP:
            if mentions(query, keyword):
                intent["query_type"] = q_type
                intent["operation"] = op
                break

        # ── top-N: extract explicit number ───────────────────────────────
        if intent["query_type"] == "top_n":
            m = re.search(r"(?<!\w)(?:top|bottom)\s+(\d+)", query)
            intent["limit"] = int(m.group(1)) if m else 5
            if mentions(query, "bottom"):
                intent["ascending"] = True

        # ── Time-word detection + granularity ───────────────────────────
        # "which month gave max sales?" → trend grouped by month, not by day.
        time_col = semantic_map.get("time")
        has_time_col = bool(time_col) and time_col in columns
        has_time_word = any(mentions(query, tw) for tw in self.TIME_WORDS)

        if mentions(query, "year") or mentions(query, "annual"):
            intent["time_granularity"] = "year"
        elif mentions(query, "month"):
            intent["time_granularity"] = "month"
        elif mentions(query, "week"):
            intent["time_granularity"] = "week"
        else:
            intent["time_granularity"] = "day"  # default: daily

        if has_time_word:
            if has_time_col:
                # Time column configured → route to trend
                intent["query_type"] = "trend"
                intent["dimension"] = time_col
            else:
                # No time column configured → flag it so the pipeline can
                # reject cleanly instead of running a bogus trend query
                intent["no_time_column"] = True

        # ── trend: always ensure dimension is time column ─────────────────
        if intent["query_type"] == "trend":
            if has_time_col:
                intent["dimension"] = time_col
            else:
                intent["no_time_column"] = True

        # ── Sort order detection ─────────────────────────────────────────
        # Explicit order phrases take priority over implicit min/max words.
        explicit_asc = any(mentions(query, p) for p in self._ASC_PHRASES)
        explicit_desc = any(mentions(query, p) for p in self._DESC_PHRASES)
        implicit_asc = any(mentions(query, w) for w in self._MIN_WORDS)

        if explicit_asc and not explicit_desc:
            intent["ascending"] = True
        elif explicit_desc and not explicit_asc:
            intent["ascending"] = False
        elif implicit_asc and not explicit_desc:
            intent["ascending"] = True
        # else: leave "ascending" as is (absent means descending downstream)

        # ── Sheet scope ──────────────────────────────────────────────────
        # Detect "in sheet Orders", "from Returns sheet", "across all sheets"
        available_sheets = context.get("excel_sheets") or []
        if available_sheets:
            scope = _detect_sheet_scope(query, available_sheets)

            # Nonexistent sheet → error immediately, don't silently fall back
            if isinstance(scope, tuple) and scope[0] == _SHEET_NOT_FOUND:
                _, mentioned = scope
                context["error"] = (
                    f"❌ Sheet '{mentioned}' is not loaded.\n"
                    f" Loaded sheets: {available_sheets}\n\n"
                    f" Try one of the loaded sheets, or re-run QueryMind "
                    f"and select '{mentioned}' if it exists in your file."
                )
                return context

            intent["sheet"] = scope

            # ── Sheet-aware dimension fallback ────────────────────────────
            # Only when a specific sheet is scoped AND the user did not name
            # a dimension: pick the first text-typed, non-id column from THAT
            # sheet rather than using the global semantic default.
            needs_fallback = not explicit_dim and (
                intent["dimension"] == default_dimension
                or intent["dimension"] in internal
                or intent["dimension"] is None
            )
            if scope and not isinstance(scope, tuple) and needs_fallback:
                sheet_df = (context.get("sheet_dataframes") or {}).get(scope)
                if sheet_df is not None:
                    id_hints = ("id", "key", "index", "row", "num", "code")
                    first_categorical = next(
                        (
                            c
                            for c in sheet_df.columns
                            if c not in internal
                            and c != intent.get("metric")
                            and not any(h in c.lower() for h in id_hints)
                            and str(sheet_df[c].dtype) in ("object", "str", "string")
                        ),
                        None,
                    )
                    if first_categorical is not None:
                        intent["dimension"] = first_categorical

        # ── Confidence ───────────────────────────────────────────────────
        has_strong = any(mentions(query, kw) for kw in self.STRONG_KEYWORDS)

        # Sheet name mention in query is a strong signal of analytical intent
        sheet_mentioned = any(s and s.lower() in query for s in available_sheets)

        # Column name mention is also a strong signal
        col_mentioned = bool(mentioned_cols)

        context["intent_confidence"] = (
            0.9 if (has_strong or sheet_mentioned or col_mentioned) else 0.2
        )

        context["intent"] = intent
        return context
431
+
432
+
433
+ # ── Sheet-scope detection (appended at module level, called inside run) ──────
434
+ # This is a standalone helper imported by InterpreterAgent.run()
435
+ # It detects patterns like:
436
+ # "top 5 sales in sheet Orders"
437
+ # "average profit from the Returns sheet"
438
+ # "across all sheets"
439
+
440
+ import re as _re
441
+
442
# Sentinel returned (as the first element of a tuple) when the user mentioned
# a sheet name that doesn't exist
_SHEET_NOT_FOUND = "__SHEET_NOT_FOUND__"


def _detect_sheet_scope(query: str, available_sheets: list):
    """
    Work out which sheet (if any) a query is scoped to.

    Returns:
    - sheet name (str)                → user referenced a loaded sheet
    - None                            → use combined df (all sheets / no sheet mentioned)
    - (_SHEET_NOT_FOUND, mentioned)   → user wrote "sheet X" but X isn't loaded;
                                        ``mentioned`` is X in title case

    Sheet names are matched case-insensitively as whole words, so a sheet
    called "Orders" is not triggered by "reorders". When several loaded
    names match and one is contained in another ("Orders" / "Orders 2024"),
    the more specific one wins; otherwise list order decides.
    """
    q = query.lower()

    # "across all sheets" / "all sheets" / "every sheet" → None (use combined df)
    if any(p in q for p in ("all sheets", "across sheets", "every sheet", "all data")):
        return None

    # Loaded sheet names that occur in the query as whole words
    matches = [
        sheet
        for sheet in available_sheets
        if sheet
        and _re.search(r"(?<!\w)" + _re.escape(sheet.lower()) + r"(?!\w)", q)
    ]
    for sheet in matches:
        shadowed = any(
            other is not sheet
            and len(other) > len(sheet)
            and sheet.lower() in other.lower()
            for other in matches
        )
        if not shadowed:
            return sheet

    # Detect "sheet <word>" pattern where <word> didn't match any loaded sheet.
    # \b keeps "spreadsheet"/"timesheet" from being read as the word "sheet",
    # and stops terminator words matching the start of longer words ("bytes").
    sheet_ref = _re.search(
        r"\bsheet\s+([\w\s]+?)(?:\s+sheet)?(?:$|\s+(?:by|in|with|for)\b)",
        q,
    )
    if sheet_ref:
        mentioned = sheet_ref.group(1).strip().title()
        return _SHEET_NOT_FOUND, mentioned  # tuple: sentinel + what user typed

    return None
app/agents/__init__.py ADDED
File without changes
@@ -0,0 +1,151 @@
1
+ import pandas as pd
2
+
3
+
4
class InsightGenerator:
    """
    Converts a raw analysis Series into a human-readable insight string.
    Respects intent["ascending"] for direction-aware language.
    """

    @staticmethod
    def _featured(result, ascending):
        """Return (value, label) of the minimum (ascending) or maximum entry."""
        if ascending:
            return result.min(), result.idxmin()
        return result.max(), result.idxmax()

    @staticmethod
    def _build_table(display_result, abs_max):
        """Render up to 8 rows as 'label  bar  value' lines; bars scale to 20 chars."""
        rows = []
        for cat, val in display_result.head(8).items():
            # A missing value gets an empty bar instead of crashing int(nan).
            bar_len = 0 if pd.isna(val) else int((abs(val) / abs_max) * 20)
            bar = "█" * bar_len
            rows.append(f" {str(cat):<25} {bar:<20} {val:>12,.2f}")
        return "\n".join(rows)

    def run(self, context):
        """Write a formatted answer into ``context["answer"]``.

        Reads ``context["analysis"]`` (a pandas Series, or a DataFrame that
        squeezes to one) and ``context["intent"]``. Returns *context*
        unchanged when either is missing or the analysis is not a non-empty
        Series. Any exception while formatting is printed, stored in
        ``context["_insight_error"]``, and a plain fallback answer is set if
        none exists yet.
        """
        result = context.get("analysis")
        intent = context.get("intent")

        if result is None or intent is None:
            return context

        try:
            # `or` rather than a .get() default: the interpreter may store an
            # explicit None for metric/dimension, which would break .replace().
            metric = str(intent.get("metric") or "value")
            dimension = str(intent.get("dimension") or "category")
            query_type = intent.get("query_type", "aggregation")
            operation = intent.get("operation", "sum")
            ascending = intent.get("ascending", False)

            # Normalise to Series
            if isinstance(result, pd.DataFrame):
                result = result.squeeze()

            if not isinstance(result, pd.Series) or result.empty:
                return context

            # --- Human-readable labels ---
            metric_label = metric.replace("_", " ").title()
            dimension_label = dimension.replace("_", " ").title()
            op_label = "Average" if operation == "mean" else "Total"

            total = result.sum()
            abs_max = result.abs().max()
            if pd.isna(abs_max) or not abs_max:
                abs_max = 1  # avoid dividing by zero / NaN when scaling bars

            # Always display table high→low for non-trend queries; trend
            # keeps the order it arrived in.
            if query_type != "trend":
                display_result = result.sort_values(ascending=False)
            else:
                display_result = result

            table = self._build_table(display_result, abs_max)

            # --- Compose answer ---
            if query_type in ("comparison", "top_n"):
                # Featured item comes from min/max, never from sort order.
                featured_value, featured_category = self._featured(result, ascending)
                pct = (featured_value / total * 100) if total else 0

                # --- Direction-aware language ---
                if ascending:
                    verb = "has the least"
                    heading_pfx = "Bottom"
                else:
                    verb = "leads with"
                    heading_pfx = "Top"

                limit = intent.get("limit")
                heading = f"{heading_pfx} {limit}" if limit else "Comparison"
                answer = (
                    f"📊 {heading} by {dimension_label}\n"
                    f"{'─' * 60}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {featured_category} {verb} {op_label.lower()} {metric_label} "
                    f"of ${featured_value:,.2f} ({pct:.1f}% of total ${total:,.2f})."
                )

            elif query_type == "aggregation":
                featured_value, featured_category = self._featured(result, ascending)
                rank_word = "least" if ascending else "highest"
                answer = (
                    f"📊 {op_label} {metric_label} by {dimension_label}\n"
                    f"{'─' * 60}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {featured_category} has the {rank_word} {op_label.lower()} "
                    f"{metric_label} at ${featured_value:,.2f}."
                )

            elif query_type == "trend":
                granularity = intent.get("time_granularity", "day")
                gran_label = {
                    "year": "Year",
                    "month": "Month",
                    "week": "Week",
                    "day": "Date",
                }.get(granularity, "Date")

                featured_val, featured_per = self._featured(result, ascending)
                superlative = "Lowest" if ascending else "Peak"

                last_category = result.index[-1]
                last_value = result.iloc[-1]
                answer = (
                    f"📈 {metric_label} by {gran_label}\n"
                    f"{'─' * 60}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {superlative} {gran_label.lower()}: {featured_per} "
                    f"(${featured_val:,.2f}). "
                    f"Latest: {last_category} (${last_value:,.2f})."
                )

            else:
                answer = f"📊 Results\n{'─' * 60}\n{table}"

            context["answer"] = answer
            return context

        except Exception as e:
            # Deliberate best-effort boundary: formatting must never take the
            # pipeline down, so report the problem and fall back to raw output.
            import traceback

            print(f"[InsightGenerator ERROR] {e}")
            traceback.print_exc()
            context["_insight_error"] = str(e)
            # Always set a fallback answer so the user sees something
            if not context.get("answer"):
                raw = context.get("analysis")
                if raw is None:
                    context["answer"] = "⚠️ Could not format results."
                elif hasattr(raw, "to_string"):
                    context["answer"] = raw.to_string()
                else:
                    context["answer"] = str(raw)
            return context
@@ -0,0 +1,59 @@
1
class IntentCorrector:
    """
    Post-processes the intent after the interpreter runs.

    Responsibilities
    ----------------
    1. If the resolved metric is not a numeric column → replace with the
       first detected numeric column.
    2. If the resolved dimension is an ID column → replace with the first
       proper categorical column. A dimension that is not a known column at
       all (including None) is replaced the same way.
    3. Ensures the intent has a query_type (defaults to "aggregation") and
       an operation (defaults to "sum").

    This runs AFTER the interpreter (rule or LLM) and BEFORE the Analyzer,
    acting as a safety layer so the Analyzer never receives bad column names.

    Steps 1 and 2 depend on context["semantic_columns"] (a dict with
    "metrics", "dimensions" and "ids" lists); step 3 does not and is applied
    whenever an intent is present.
    """

    def run(self, context: dict) -> dict:
        """Correct ``context["intent"]`` in place and return *context*.

        ``context["intent_corrected"]`` is set to True only when the column
        corrections could run, i.e. when ``semantic_columns`` was available.
        """
        intent = context.get("intent")
        if not intent:
            return context  # nothing to correct

        # --- Defaults that need no schema knowledge ---
        # Applied before the semantic_columns check so a missing schema does
        # not leave the intent without a query type / operation.
        if not intent.get("query_type"):
            intent["query_type"] = "aggregation"
        if not intent.get("operation"):
            intent["operation"] = "sum"

        semantic = context.get("semantic_columns")
        if not semantic:
            # No column classification available; can't correct columns
            return context

        metrics = semantic.get("metrics", [])
        dimensions = semantic.get("dimensions", [])
        ids = semantic.get("ids", [])

        # --- Fix metric ---
        if metrics and intent.get("metric") not in metrics:
            intent["metric"] = metrics[0]

        # --- Fix dimension ---
        # Replace an ID column or an unknown name; a dimension that is a
        # known metric column is deliberately left alone.
        if dimensions:
            current_dim = intent.get("dimension")
            is_unknown = current_dim not in metrics + dimensions + ids
            if current_dim in ids or is_unknown:
                intent["dimension"] = dimensions[0]

        context["intent"] = intent
        context["intent_corrected"] = True

        return context