querymind-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/agents/InterpreterAgent.py +473 -0
- app/agents/__init__.py +0 -0
- app/agents/insights_generator.py +151 -0
- app/agents/intent_corrector.py +59 -0
- app/agents/llm_intepreter.py +132 -0
- app/agents/narrator.py +27 -0
- app/agents/planner.py +77 -0
- app/cli/__init__.py +0 -0
- app/cli/main.py +346 -0
- app/cli/tui_app.py +98 -0
- app/cli/ui.py +21 -0
- app/core/__init__.py +0 -0
- app/core/context.py +10 -0
- app/core/logger.py +2 -0
- app/core/pipeline.py +379 -0
- app/data/__init__.py +0 -0
- app/data/connectors/csv_connector.py +99 -0
- app/data/connectors/excel_connector.py +68 -0
- app/data/connectors/no_sql_db_connector.py +0 -0
- app/data/connectors/sql_db_connector.py +0 -0
- app/data/schema_engine.py +18 -0
- app/data/type_caster.py +128 -0
- app/executor/__init__.py +0 -0
- app/executor/db_executor.py +0 -0
- app/executor/sheet_selector.py +120 -0
- app/llm/ollama_client.py +47 -0
- app/prompts/interpreter_prompt.txt +28 -0
- app/security/__init__.py +0 -0
- app/security/input_guard.py +133 -0
- app/security/schema_filter.py +20 -0
- app/tests/__init__.py +0 -0
- app/tests/llm_test.py +18 -0
- app/tools/__init__.py +0 -0
- app/tools/analyzer.py +157 -0
- app/tools/join_resolver.py +159 -0
- app/tools/sql_writer.py +37 -0
- app/tools/validator.py +0 -0
- querymind_cli-0.1.0.dist-info/METADATA +139 -0
- querymind_cli-0.1.0.dist-info/RECORD +43 -0
- querymind_cli-0.1.0.dist-info/WHEEL +5 -0
- querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
- querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- querymind_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class InterpreterAgent:
    """
    Rule-based intent interpreter.

    Builds a structured intent dict from the user query using keyword matching
    and the semantic_map supplied at setup time.

    Confidence rules
    ----------------
    - 0.9  → matched a strong analytical keyword, a loaded sheet name or a
             column name; skip LLM
    - 0.2  → no recognisable keyword; defer to LLM

    Known limitation
    ----------------
    All keyword / column / sheet detection is plain substring containment on
    the lower-cased query, so a keyword also fires inside a longer word
    (e.g. "top" inside "laptop"). Column names are compared as-is against the
    lower-cased query.
    NOTE(review): this presumably relies on column names being normalised to
    lower case upstream — confirm against the connectors.
    """

    STRONG_KEYWORDS = {
        "highest",
        "lowest",
        "top",
        "bottom",
        "average",
        "avg",
        "mean",
        "trend",
        "over time",
        "monthly",
        "daily",
        "weekly",
        "yearly",
        "total",
        "sum",
        "most",
        "least",
        "max",
        "min",
        "distribution",
        "breakdown",
        "compare",
        "which",
        "what",
        "when",
        # Display / grouping words — common in natural phrasing
        "show",
        "show me",
        "give me",
        "list",
        "get",
        "find",
        "by",
        "per",
        "across",
        "grouped",
        "group",
        # Sort order
        "ascending",
        "descending",
        "asc",
        "desc",
        "increasing",
        "decreasing",
        "sorted",
        "order",
    }

    # (keyword, query_type, operation). Order matters: first match wins.
    QUERY_TYPE_MAP = [
        ("top", "top_n", "sum"),
        ("bottom", "top_n", "sum"),
        ("highest", "comparison", "sum"),
        ("lowest", "comparison", "sum"),
        ("most", "comparison", "sum"),
        ("least", "comparison", "sum"),
        ("average", "aggregation", "mean"),
        ("avg", "aggregation", "mean"),
        ("mean", "aggregation", "mean"),
        ("trend", "trend", "sum"),
        ("over time", "trend", "sum"),
        ("monthly", "trend", "sum"),
        ("daily", "trend", "sum"),
        ("total", "aggregation", "sum"),
        ("sum", "aggregation", "sum"),
        ("distribution", "aggregation", "sum"),
        ("breakdown", "aggregation", "sum"),
        ("compare", "comparison", "sum"),
        ("max", "comparison", "sum"),
        ("min", "comparison", "sum"),
        ("weekly", "trend", "sum"),
        ("yearly", "trend", "sum"),
        ("which", "comparison", "sum"),
        ("when", "trend", "sum"),
    ]

    # Natural-language synonyms that mean "use the configured metric".
    # When a user says "top 5 items by SALES", "sales" isn't a column —
    # it's a synonym for whatever metric they configured.
    # run() does not have to act on these: when no column is named in the
    # query the configured metric is already the intent's metric. The set is
    # kept because it is a public attribute of the class.
    METRIC_SYNONYMS = {
        "sales",
        "revenue",
        "profit",
        "spend",
        "spending",
        "spent",
        "cost",
        "costs",
        "amount",
        "amounts",
        "value",
        "values",
        "earning",
        "earnings",
        "income",
        "price",
        "prices",
    }

    # Natural-language synonyms that map to known dimension keywords.
    DIMENSION_KEYWORDS = {
        "location": "location",
        "city": "location",
        "region": "location",
        "payment": "payment_method",
        "payment method": "payment_method",
        "item": "item",
        "items": "item",
        "product": "item",
        "products": "item",
        "category": "item",
    }

    # Time-related words that should route dimension → time column
    TIME_WORDS = {
        "month",
        "monthly",
        "year",
        "yearly",
        "annual",
        "week",
        "weekly",
        "daily",
        "day",
        "date",
        "over time",
        "when",
    }

    # ── Private lookup tables (hoisted out of run() so they are built once) ──

    # Internal/system columns that must never appear in intent.
    _INTERNAL_COLS = frozenset({"_sheet"})

    # Name fragments that disqualify a column from being the fallback dimension.
    _FALLBACK_ID_HINTS = ("id", "row_id", "index", "key")

    # Name fragments that make a mentioned column count as a metric.
    _METRIC_NAME_HINTS = (
        "amount",
        "price",
        "spent",
        "revenue",
        "sales",
        "cost",
        "total",
        "sum",
        "qty",
        "quantity",
        "profit",
    )

    # Name fragments that disqualify a sheet column from the sheet-aware
    # dimension fallback.
    _SHEET_ID_HINTS = ("id", "key", "index", "row", "num", "code")

    _ASC_PHRASES = (
        "ascending order",
        "ascending",
        "asc order",
        "asc",
        "lowest to highest",
        "low to high",
        "smallest to largest",
        "increasing order",
        "increasing",
        "worst to best",
        "least to most",
    )
    _DESC_PHRASES = (
        "descending order",
        "descending",
        "desc order",
        "desc",
        "highest to lowest",
        "high to low",
        "largest to smallest",
        "decreasing order",
        "decreasing",
        "best to worst",
        "most to least",
    )
    # Implicit: min/low/worst words suggest ascending (lowest first).
    _MIN_WORDS = (
        "minimum",
        "min",
        "less",
        "least",
        "lowest",
        "worst",
        "bottom",
        "fewest",
        "smallest",
    )

    @classmethod
    def _is_mentioned(cls, col, query):
        """Return True when a non-internal, non-blank column name occurs in *query*.

        Both the raw name and its "underscores as spaces" form are tried.
        """
        if col in cls._INTERNAL_COLS:
            return False
        readable = col.replace("_", " ").strip()
        if not readable:
            return False
        return col in query or readable in query

    def run(self, context):
        """Derive ``context["intent"]`` and ``context["intent_confidence"]``.

        Reads from *context*:
            - ``user_query``: the raw question (lower-cased and stripped here)
            - ``schema["columns"]``: list of dicts, each with a ``name``
            - ``semantic_map``: optional ``metric`` / ``dimension`` / ``time``
              column names
            - ``excel_sheets`` (optional): names of the loaded sheets
            - ``sheet_dataframes`` (optional): mapping sheet name → dataframe

        Writes to *context*:
            - ``error`` (and returns early) for an empty / digits-only query
              or a reference to a sheet that is not loaded
            - otherwise ``intent`` and ``intent_confidence``

        Returns the same *context* object.
        """
        query = context["user_query"].lower().strip()
        schema = context["schema"]["columns"]
        semantic_map = context["semantic_map"]

        columns = [col["name"] for col in schema]
        internal = self._INTERNAL_COLS

        # Validate semantic_map columns exist in the dataframe and are not internal
        default_metric = semantic_map.get("metric")
        default_dimension = semantic_map.get("dimension")

        if default_metric not in columns or default_metric in internal:
            default_metric = None
        if default_dimension not in columns or default_dimension in internal:
            # Pick the first usable column as fallback. The metric column is
            # excluded: grouping the metric by itself is never meaningful.
            default_dimension = next(
                (
                    c
                    for c in columns
                    if c not in internal
                    and c != default_metric
                    and not any(h in c for h in self._FALLBACK_ID_HINTS)
                ),
                None,
            )

        # Guard: empty / numeric-only
        if not query or query.isdigit():
            context["error"] = "Please enter a meaningful question."
            return context

        # ── Base intent ──────────────────────────────────────────────────
        intent = {
            "metric": default_metric,
            "dimension": default_dimension,
            "query_type": "aggregation",
            "operation": "sum",
            "limit": None,
        }

        # ── Metric resolution ────────────────────────────────────────────
        # The first column that is named in the query AND has a numeric-ish
        # name becomes the metric; otherwise the configured default stays.
        for col in columns:
            if self._is_mentioned(col, query) and any(
                h in col for h in self._METRIC_NAME_HINTS
            ):
                intent["metric"] = col
                break

        # ── Dimension resolution ─────────────────────────────────────────
        # dimension_explicit records that the USER named the dimension, so
        # later fallbacks know they must not replace it.
        dimension_explicit = False

        # 1. Keyword → dimension map (explicit, fast)
        for keyword, col_name in self.DIMENSION_KEYWORDS.items():
            if keyword in query and col_name in columns:
                intent["dimension"] = col_name
                dimension_explicit = True
                break

        # 2. Exact column name match (catches user's own column names)
        if not dimension_explicit:
            for col in columns:
                # don't use metric as dimension
                if self._is_mentioned(col, query) and col != intent["metric"]:
                    intent["dimension"] = col
                    dimension_explicit = True
                    break

        # 3. Time-column override for trend queries (handled below after query type)

        # ── Query-type detection ─────────────────────────────────────────
        for keyword, q_type, op in self.QUERY_TYPE_MAP:
            if keyword in query:
                intent["query_type"] = q_type
                intent["operation"] = op
                break

        # ── top-N: extract explicit number ───────────────────────────────
        if intent["query_type"] == "top_n":
            m = re.search(r"(?:top|bottom)\s+(\d+)", query)
            intent["limit"] = int(m.group(1)) if m else 5
            if "bottom" in query:
                intent["ascending"] = True

        # ── Time-word detection + granularity ───────────────────────────
        # "which month gave max sales?" → trend grouped by month, not by day.
        # Detect granularity first, then override query_type to trend.
        time_col = semantic_map.get("time")
        has_time_col = bool(time_col) and time_col in columns
        has_time_word = any(tw in query for tw in self.TIME_WORDS)

        # Granularity: what period to group by ("yearly"/"monthly"/"weekly"
        # already contain "year"/"month"/"week").
        if "year" in query or "annual" in query:
            intent["time_granularity"] = "year"
        elif "month" in query:
            intent["time_granularity"] = "month"
        elif "week" in query:
            intent["time_granularity"] = "week"
        else:
            intent["time_granularity"] = "day"  # default: daily

        if has_time_word:
            if has_time_col:
                # Time column configured → route to trend
                intent["query_type"] = "trend"
                intent["dimension"] = time_col
            else:
                # No time column configured → flag the error so pipeline
                # can reject cleanly instead of running a bogus trend query
                intent["no_time_column"] = True

        # ── trend: always ensure dimension is time column ─────────────────
        if intent["query_type"] == "trend":
            if has_time_col:
                intent["dimension"] = time_col
            else:
                intent["no_time_column"] = True

        # ── Sort order detection ─────────────────────────────────────────
        # Explicit order phrases take priority over implicit min/max words.
        explicit_asc = any(p in query for p in self._ASC_PHRASES)
        explicit_desc = any(p in query for p in self._DESC_PHRASES)
        implicit_asc = any(w in query for w in self._MIN_WORDS)

        if explicit_asc and not explicit_desc:
            intent["ascending"] = True
        elif explicit_desc and not explicit_asc:
            intent["ascending"] = False
        elif implicit_asc and not explicit_desc:
            intent["ascending"] = True
        # else: no "ascending" key is written; consumers treat a missing key
        # as descending / highest first.

        # ── Sheet scope ──────────────────────────────────────────────────
        # Detect "in sheet Orders", "from Returns sheet", "across all sheets"
        available_sheets = context.get("excel_sheets") or []
        if available_sheets:
            scope = _detect_sheet_scope(query, available_sheets)

            # Nonexistent sheet → error immediately, don't silently fall back
            if isinstance(scope, tuple) and scope[0] == _SHEET_NOT_FOUND:
                _, mentioned = scope
                context["error"] = (
                    f"❌ Sheet '{mentioned}' is not loaded.\n"
                    f" Loaded sheets: {available_sheets}\n\n"
                    f" Try one of the loaded sheets, or re-run QueryMind "
                    f"and select '{mentioned}' if it exists in your file."
                )
                return context

            intent["sheet"] = scope

            # ── Sheet-aware dimension fallback ────────────────────────────
            # If a specific sheet is scoped AND no explicit dimension was
            # found in the query, pick the first valid categorical column
            # from THAT sheet rather than using the global semantic default.
            # This prevents "_sheet" or a cross-sheet column from leaking in.
            # A dimension the user named, or the time column of a trend
            # query, is never replaced here.
            dimension_is_default = (
                intent["dimension"] == default_dimension
                or intent["dimension"] in internal
                or intent["dimension"] is None
            )
            if (
                scope
                and not isinstance(scope, tuple)
                and not dimension_explicit
                and intent["query_type"] != "trend"
                and dimension_is_default
            ):
                sheet_df = context.get("sheet_dataframes", {}).get(scope)
                if sheet_df is not None:
                    # First non-internal, non-metric, non-id column whose
                    # dtype is a string/object dtype.
                    sheet_categoricals = [
                        c
                        for c in sheet_df.columns
                        if c not in internal
                        and c != intent.get("metric")
                        and not any(h in c.lower() for h in self._SHEET_ID_HINTS)
                        and str(sheet_df[c].dtype) in ("object", "str", "string")
                    ]
                    if sheet_categoricals:
                        intent["dimension"] = sheet_categoricals[0]

        # ── Confidence ───────────────────────────────────────────────────
        has_strong = any(kw in query for kw in self.STRONG_KEYWORDS)

        # Sheet name mention in query is a strong signal of analytical intent
        sheet_mentioned = any(s.lower() in query for s in available_sheets)

        # Column name mention is also a strong signal. Blank names are
        # skipped: "" is a substring of every query and would force 0.9.
        col_mentioned = any(
            (col.replace("_", " ") in query or col in query)
            for col in columns
            if col.replace("_", " ").strip()
        )

        context["intent_confidence"] = (
            0.9 if (has_strong or sheet_mentioned or col_mentioned) else 0.2
        )

        context["intent"] = intent
        return context
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
# ── Sheet-scope detection (appended at module level, called inside run) ──────
|
|
434
|
+
# This is a standalone module-level helper called by InterpreterAgent.run().
|
|
435
|
+
# It detects patterns like:
|
|
436
|
+
# "top 5 sales in sheet Orders"
|
|
437
|
+
# "average profit from the Returns sheet"
|
|
438
|
+
# "across all sheets"
|
|
439
|
+
|
|
440
|
+
import re as _re

# Sentinel used as the first element of the tuple returned when the user
# mentioned a sheet name that doesn't exist
_SHEET_NOT_FOUND = "__SHEET_NOT_FOUND__"

# "sheet <name>" followed by end-of-query or a connecting word. The \b keeps
# words that merely end in "sheet" ("timesheet", "spreadsheet") from being
# read as a sheet reference.
_SHEET_REF_RE = _re.compile(
    r"(?:in|from|on|the|of)?\s*\bsheet\s+([\w\s]+?)(?:\s+sheet)?(?:$|\s+by|\s+in|\s+with|\s+for)"
)


def _detect_sheet_scope(query: str, available_sheets: list):
    """
    Work out which sheet (if any) the query is scoped to.

    Args:
        query: the user's question; compared case-insensitively.
        available_sheets: names of the loaded sheets.

    Returns:
      - sheet name (str), exactly as it appears in ``available_sheets``
            → user referenced a loaded sheet
      - None
            → use combined df (all sheets / no sheet mentioned)
      - (_SHEET_NOT_FOUND, name) tuple
            → user wrote "sheet X" but X isn't loaded; ``name`` is X in
              title case
    """
    q = query.lower()

    # "across all sheets" / "all sheets" / "every sheet" → None (use combined df)
    if any(p in q for p in ["all sheets", "across sheets", "every sheet", "all data"]):
        return None

    # Match a loaded sheet name. Longest names are tried first so that
    # "Order Items" wins over "Order" when both are loaded; sorted() is
    # stable, so equal-length names keep their original priority.
    # Blank names are skipped because "" is a substring of every query.
    for sheet in sorted(available_sheets, key=len, reverse=True):
        if sheet and sheet.lower() in q:
            return sheet

    # Detect "sheet <word>" pattern where <word> didn't match any loaded sheet
    sheet_ref = _SHEET_REF_RE.search(q)
    if sheet_ref:
        mentioned = sheet_ref.group(1).strip().title()
        return _SHEET_NOT_FOUND, mentioned  # return tuple: sentinel + what user typed

    return None
|
app/agents/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class InsightGenerator:
    """
    Converts a raw analysis Series into a human-readable insight string.
    Respects intent["ascending"] for direction-aware language.
    """

    @staticmethod
    def _render_table(series, abs_max):
        """Render up to 8 rows of *series* as "label, bar, value" text lines.

        Bars are scaled to 20 characters against *abs_max*. A missing (NaN)
        value gets an empty bar instead of raising on ``int(nan)``.
        """
        rows = []
        for cat, val in series.head(8).items():
            bar_len = 0 if pd.isna(val) else int((abs(val) / abs_max) * 20)
            bar = "█" * bar_len
            rows.append(f" {str(cat):<25} {bar:<20} {val:>12,.2f}")
        return "\n".join(rows)

    def run(self, context):
        """Format ``context["analysis"]`` into ``context["answer"]``.

        Reads ``context["analysis"]`` (a pandas Series, or a DataFrame that
        squeezes to one) and ``context["intent"]``. Returns *context*
        unchanged when either is missing or the result is not a non-empty
        Series. On any formatting error the message is stored in
        ``context["_insight_error"]`` and a plain dump of the raw result is
        used as the answer, so the user always sees something.
        """
        result = context.get("analysis")
        intent = context.get("intent")

        if result is None or intent is None:
            return context

        try:
            # `or` (not a .get default) because the keys may be present with
            # a None value, which would otherwise break the label building.
            metric = intent.get("metric") or "value"
            dimension = intent.get("dimension") or "category"
            query_type = intent.get("query_type", "aggregation")
            operation = intent.get("operation", "sum")
            ascending = intent.get("ascending", False)

            # Normalise to Series
            if isinstance(result, pd.DataFrame):
                result = result.squeeze()

            if not isinstance(result, pd.Series) or result.empty:
                return context

            # --- Human-readable labels ---
            metric_label = str(metric).replace("_", " ").title()
            dimension_label = str(dimension).replace("_", " ").title()
            op_label = "Average" if operation == "mean" else "Total"

            total = result.sum()
            abs_max = result.abs().max()
            if pd.isna(abs_max) or not abs_max:
                abs_max = 1  # avoid dividing by zero / NaN when scaling bars

            # Always derive the featured item from idxmax/idxmin — never rely
            # on sort order — so the insight is correct regardless of how
            # the analyzer returned the result.
            if ascending:
                featured_value = result.min()
                featured_category = result.idxmin()
            else:
                featured_value = result.max()
                featured_category = result.idxmax()

            # Table is shown high→low, except trend which keeps the order
            # the analyzer produced (chronological).
            if query_type == "trend":
                display_result = result
            else:
                display_result = result.sort_values(ascending=False)

            # --- Build result table (up to 8 rows) ---
            table = self._render_table(display_result, abs_max)
            rule = "─" * 60

            # --- Compose answer ---
            if query_type in ("comparison", "top_n"):
                # Direction-aware language
                verb = "has the least" if ascending else "leads with"
                heading_pfx = "Bottom" if ascending else "Top"

                limit = intent.get("limit")
                heading = f"{heading_pfx} {limit}" if limit else "Comparison"
                pct = (featured_value / total * 100) if total else 0
                answer = (
                    f"📊 {heading} by {dimension_label}\n"
                    f"{rule}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {featured_category} {verb} {op_label.lower()} {metric_label} "
                    f"of ${featured_value:,.2f} ({pct:.1f}% of total ${total:,.2f})."
                )

            elif query_type == "aggregation":
                rank_word = "least" if ascending else "highest"
                answer = (
                    f"📊 {op_label} {metric_label} by {dimension_label}\n"
                    f"{rule}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {featured_category} has the {rank_word} {op_label.lower()} "
                    f"{metric_label} at ${featured_value:,.2f}."
                )

            elif query_type == "trend":
                granularity = intent.get("time_granularity", "day")
                gran_label = {
                    "year": "Year",
                    "month": "Month",
                    "week": "Week",
                    "day": "Date",
                }.get(granularity, "Date")
                superlative = "Lowest" if ascending else "Peak"

                last_category = result.index[-1]
                last_value = result.iloc[-1]
                answer = (
                    f"📈 {metric_label} by {gran_label}\n"
                    f"{rule}\n"
                    f"{table}\n\n"
                    f"💡 Insight\n"
                    f" {superlative} {gran_label.lower()}: {featured_category} "
                    f"(${featured_value:,.2f}). "
                    f"Latest: {last_category} (${last_value:,.2f})."
                )

            else:
                answer = f"📊 Results\n{rule}\n{table}"

            context["answer"] = answer
            return context

        except Exception as e:
            # Deliberate best-effort boundary: report the failure, then fall
            # back to a raw dump rather than propagating.
            import traceback

            print(f"[InsightGenerator ERROR] {e}")
            traceback.print_exc()
            context["_insight_error"] = str(e)
            # Always set a fallback answer so the user sees something
            if not context.get("answer"):
                raw = context.get("analysis")
                if raw is None:
                    context["answer"] = "⚠️ Could not format results."
                elif hasattr(raw, "to_string"):
                    context["answer"] = raw.to_string()
                else:
                    context["answer"] = str(raw)
            return context
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
class IntentCorrector:
    """
    Safety layer that repairs the intent before it reaches the Analyzer.

    It runs AFTER the interpreter (rule or LLM) and BEFORE the Analyzer so
    the Analyzer never receives bad column names.

    What it repairs
    ---------------
    1. Metric that is not one of the numeric columns → first numeric column.
    2. Dimension that is an ID column, or that is not a known column at all
       → first categorical column. A dimension that is a known metric column
       is left alone.
    3. Missing query_type → "aggregation"; missing operation → "sum".

    Depends on context["semantic_columns"] being populated by SchemaEngine.
    """

    # Fallbacks applied, in this order, to intent keys that are missing/empty.
    _INTENT_DEFAULTS = (
        ("query_type", "aggregation"),
        ("operation", "sum"),
    )

    def run(self, context: dict) -> dict:
        """Correct ``context["intent"]`` in place and flag ``intent_corrected``.

        Returns *context* untouched when there is no intent to correct or
        when ``semantic_columns`` is missing/empty (nothing to correct against).
        """
        intent = context.get("intent")
        semantic = context.get("semantic_columns")  # set by SchemaEngine

        if not intent or not semantic:
            return context

        metric_cols = semantic.get("metrics", [])
        dimension_cols = semantic.get("dimensions", [])
        id_cols = semantic.get("ids", [])

        # --- Metric: must be one of the numeric columns ---
        if intent.get("metric") not in metric_cols and metric_cols:
            intent["metric"] = metric_cols[0]

        # --- Dimension: reject ID columns and unknown names ---
        chosen = intent.get("dimension")
        replace_dimension = False
        if chosen in id_cols and dimension_cols:
            replace_dimension = True
        elif chosen not in dimension_cols and dimension_cols:
            # Only override if the current value isn't in any known column list
            replace_dimension = chosen not in (metric_cols + dimension_cols + id_cols)
        if replace_dimension:
            intent["dimension"] = dimension_cols[0]

        # --- Make sure query_type and operation are populated ---
        for key, fallback in self._INTENT_DEFAULTS:
            if not intent.get(key):
                intent[key] = fallback

        context["intent"] = intent
        context["intent_corrected"] = True

        return context
|