querymind-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- app/agents/InterpreterAgent.py +473 -0
- app/agents/__init__.py +0 -0
- app/agents/insights_generator.py +151 -0
- app/agents/intent_corrector.py +59 -0
- app/agents/llm_intepreter.py +132 -0
- app/agents/narrator.py +27 -0
- app/agents/planner.py +77 -0
- app/cli/__init__.py +0 -0
- app/cli/main.py +346 -0
- app/cli/tui_app.py +98 -0
- app/cli/ui.py +21 -0
- app/core/__init__.py +0 -0
- app/core/context.py +10 -0
- app/core/logger.py +2 -0
- app/core/pipeline.py +379 -0
- app/data/__init__.py +0 -0
- app/data/connectors/csv_connector.py +99 -0
- app/data/connectors/excel_connector.py +68 -0
- app/data/connectors/no_sql_db_connector.py +0 -0
- app/data/connectors/sql_db_connector.py +0 -0
- app/data/schema_engine.py +18 -0
- app/data/type_caster.py +128 -0
- app/executor/__init__.py +0 -0
- app/executor/db_executor.py +0 -0
- app/executor/sheet_selector.py +120 -0
- app/llm/ollama_client.py +47 -0
- app/prompts/interpreter_prompt.txt +28 -0
- app/security/__init__.py +0 -0
- app/security/input_guard.py +133 -0
- app/security/schema_filter.py +20 -0
- app/tests/__init__.py +0 -0
- app/tests/llm_test.py +18 -0
- app/tools/__init__.py +0 -0
- app/tools/analyzer.py +157 -0
- app/tools/join_resolver.py +159 -0
- app/tools/sql_writer.py +37 -0
- app/tools/validator.py +0 -0
- querymind_cli-0.1.0.dist-info/METADATA +139 -0
- querymind_cli-0.1.0.dist-info/RECORD +43 -0
- querymind_cli-0.1.0.dist-info/WHEEL +5 -0
- querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
- querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- querymind_cli-0.1.0.dist-info/top_level.txt +1 -0
app/cli/ui.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.panel import Panel
|
|
3
|
+
|
|
4
|
+
console = Console()
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def show_header():
    """Print the QueryMind banner inside a content-fitted, blue-bordered panel."""
    # Two-line banner: product title on the first line, tagline on the second.
    banner = (
        "[bold #2dd9fe] 🧠 QueryMind CLI [/bold #2dd9fe]\n"
        "[#74ee15 ] Your AI Data Analyst [/#74ee15]"
    )
    header_panel = Panel.fit(banner, border_style="blue")
    console.print(header_panel)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def show_message(role, text):
    """
    Print one chat line, styled by who is speaking.

    role – "user" prints the text in bold blue behind a ">>" prompt;
           any other value prints it in bold green behind ">>💡".
    text – the message to display.

    User text is escaped before it is embedded in Rich markup. Without
    that, a query containing square brackets (e.g. "show [sales]") is parsed
    as a style tag and silently dropped, and something like "[/x]" raises
    rich.errors.MarkupError.
    """
    if role == "user":
        # Local import keeps this block self-contained; rich is already a
        # dependency of this module.
        from rich.markup import escape

        console.print(f"[bold blue]>> {escape(str(text))}[/bold blue]")
    else:
        # NOTE(review): non-user text is left unescaped because upstream
        # producers may emit Rich markup on purpose — confirm, and escape
        # here too if they never do.
        console.print(f"[bold green]>>💡 {text}[/bold green]")
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/context.py
ADDED
app/core/logger.py
ADDED
app/core/pipeline.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
from app.agents.InterpreterAgent import InterpreterAgent
|
|
2
|
+
from app.agents.llm_intepreter import LLMInterpreter
|
|
3
|
+
from app.tools.analyzer import Analyzer
|
|
4
|
+
from app.security.input_guard import InputGuard
|
|
5
|
+
from app.data.connectors.csv_connector import CSVConnector
|
|
6
|
+
from app.data.connectors.excel_connector import ExcelConnector
|
|
7
|
+
from app.security.schema_filter import SchemaFilter
|
|
8
|
+
from app.data.schema_engine import SchemaEngine
|
|
9
|
+
from app.agents.insights_generator import InsightGenerator
|
|
10
|
+
from app.tools.join_resolver import JoinResolver
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class QueryMindPipeline:
    """
    Orchestrates the full query → insight pipeline.

    Accepts either a CSVConnector or ExcelConnector — the rest of the
    pipeline is connector-agnostic.

    Step sequence
    -------------
    1. InputGuard        – blocks junk / sensitive input
    2. InterpreterAgent  – fast rule-based intent extraction
    3. LLMInterpreter    – runs only when confidence < LLM_CONFIDENCE_THRESHOLD;
                           if the LLM is unavailable or fails, the query is
                           rejected with a "please rephrase" error
    3.5 Guards           – missing-column and missing-time-column checks
    3.7 JoinResolver     – cross-sheet join resolution
    4. Analyzer          – pandas operations; sheet-aware for Excel
    5. InsightGenerator  – formats raw Series → readable answer
    """

    # Rule-based intents scoring below this are handed to the LLM interpreter.
    LLM_CONFIDENCE_THRESHOLD = 0.8

    # Example queries shared by both "couldn't understand" error messages.
    _REPHRASE_EXAMPLES = (
        " • 'top 5 items by sales'\n"
        " • 'highest revenue by location'\n"
        " • 'average spend by payment method'\n"
        " • 'total sales trend over time'"
    )

    # Query words that are never treated as a reference to a (missing) column.
    # Built once at class creation instead of on every query.
    _STOP_WORDS = frozenset(
        {
            "which", "what", "who", "where", "when", "how", "the", "was",
            "were", "had", "has", "have", "did", "does", "most", "least",
            "max", "min", "top", "highest", "lowest", "total", "average",
            "used", "give", "show", "list", "find", "get", "and", "for",
            "with", "from", "that", "this", "are", "all", "per", "across",
            "gave", "its", "their", "use", "been", "much", "many", "more",
            "less", "than", "into", "over", "each", "some", "any", "our",
            "not", "but", "can", "could", "would", "using", "like",
            "sales", "revenue", "profit", "spend", "spending", "spent",
            "cost", "amount", "value", "number", "count", "sum", "avg",
            "mean", "sheet", "data", "file", "table", "column", "field",
            "ascending", "descending", "asc", "desc", "increasing",
            "decreasing", "sort", "sorted", "order", "ordering",
            # Common filler words that are not column names
            "specific", "wise", "based", "overall", "respective", "related",
            "breakdown", "detail", "particular", "certain", "various",
            "different",
        }
    )

    def __init__(self, connector, semantic_map: dict):
        """
        connector    – a CSVConnector or ExcelConnector instance
        semantic_map – {"metric": col, "dimension": col, "time": col|None}

        Raises RuntimeError when the connector reports a load error.
        """
        self.semantic_map = semantic_map

        # Infrastructure
        self.schema_filter = SchemaFilter()
        self.schema_engine = SchemaEngine()

        # The configured column names are passed to the guard as extra
        # domain words; unset entries (e.g. no time column) are skipped.
        extra_words = [v for v in semantic_map.values() if v]
        self.input_guard = InputGuard(extra_domain_words=extra_words)

        # Agents
        self.interpreter = InterpreterAgent()
        self.llm_interpreter = LLMInterpreter()
        self.insight_generator = InsightGenerator()
        self.analyzer = Analyzer()
        self.join_resolver = JoinResolver()

        # Check Ollama availability at startup
        self.llm_available = self._check_ollama()

        # Load + cache base context once at startup
        self._base_context = {}
        self._base_context = connector.run(self._base_context)
        if self._base_context.get("error"):
            raise RuntimeError(f"Failed to load data: {self._base_context['error']}")
        self._base_context = self.schema_filter.run(self._base_context)
        self._base_context = self.schema_engine.run(self._base_context)

    # ------------------------------------------------------------------
    def _check_ollama(self) -> bool:
        """
        Ping Ollama at startup. Returns True if reachable, False otherwise.
        Prints a clear warning so the user knows LLM fallback is disabled.
        """
        try:
            import requests

            resp = requests.get("http://localhost:11434", timeout=3)
            if resp.status_code == 200:
                print("✅ Ollama detected — LLM fallback enabled")
                return True
        except Exception:
            # Deliberate best-effort probe: a missing `requests` package, a
            # refused connection or a timeout all mean "no LLM fallback".
            pass
        print(
            "⚠️ Ollama not detected on localhost:11434\n"
            " LLM fallback disabled — rule-based interpreter only.\n"
            " To enable: install Ollama from https://ollama.ai and run: ollama pull phi"
        )
        return False

    # ------------------------------------------------------------------
    @staticmethod
    def _char_overlap(a: str, b: str) -> float:
        """Jaccard similarity of the character sets of *a* and *b* (underscores ignored)."""
        a_set = set(a.replace("_", ""))
        b_set = set(b.replace("_", ""))
        return len(a_set & b_set) / max(len(a_set | b_set), 1)

    def _check_missing_column(self, context: dict) -> dict:
        """
        Detects when the user's query references a column that doesn't exist
        in any loaded sheet, and the interpreter silently fell back to the
        semantic default dimension.

        Sets context["error"] with a helpful message if detected; otherwise
        returns the context untouched.
        """
        import re

        query = context.get("user_query", "").lower()
        intent = context.get("intent") or {}
        schema = context.get("schema") or {}
        semantic = context.get("semantic_map") or {}

        # Only check when interpreter fell back to semantic default
        # (means it couldn't find an explicit column match in the query)
        if intent.get("dimension", "") != semantic.get("dimension", ""):
            return context

        columns = [col["name"] for col in schema.get("columns", [])]
        col_set = set(columns)
        # Individual words of multi-word column names ("order_date" → "order", "date").
        col_words = {
            part for col in columns for part in col.split("_") if len(part) > 2
        }

        # Add sheet names AND every individual word in each sheet name
        # so "List of Orders" doesn't cause "orders" to be flagged as missing
        stop_words = set(self._STOP_WORDS)
        for sheet_name in context.get("excel_sheets", []):
            lowered = sheet_name.lower()
            stop_words.add(lowered)
            stop_words.update(lowered.split())

        words = re.findall(r"[a-zA-Z]+", query)

        def is_unknown(word: str) -> bool:
            return word not in stop_words and word not in col_words

        # Unigrams: words unknown to both stop list and schema
        unknowns = [
            w for w in words if is_unknown(w) and w not in col_set and len(w) > 3
        ]

        # Bigrams: adjacent unknown-word pairs as potential column names
        bigrams = []
        for a, b in zip(words, words[1:]):
            pair = f"{a}_{b}"
            if (
                pair not in col_set
                and is_unknown(a)
                and is_unknown(b)
                and len(a) > 2
                and len(b) > 2
            ):
                bigrams.append(pair)

        candidates = bigrams + unknowns
        if not candidates:
            return context

        # Most likely missing column = longest candidate. dict.fromkeys()
        # de-duplicates while keeping query order, so ties resolve to the
        # first occurrence instead of depending on set/hash ordering.
        most_likely = max(dict.fromkeys(candidates), key=len)

        # Suggest the closest real column names using character overlap
        visible_columns = [c for c in columns if c != "_sheet"]
        suggestions = sorted(
            visible_columns,
            key=lambda c: self._char_overlap(most_likely, c),
            reverse=True,
        )[:3]

        message = (
            f"❓ Column '{most_likely.replace('_', ' ')}' doesn't exist in your data.\n\n"
            f" Available columns: {visible_columns}\n\n"
            f" Closest matches: {suggestions}\n"
        )
        # With no usable columns there is nothing to suggest; skip the
        # example rather than crash on suggestions[0].
        if suggestions:
            message += (
                f" Try rephrasing — e.g. 'which {suggestions[0]} had the most "
                f"{semantic.get('metric', 'value')}?'"
            )
        context["error"] = message
        return context

    # ------------------------------------------------------------------
    def run(self, context: dict) -> dict:
        """
        Run one user query through the pipeline.

        context – per-query dict; the guards and the error messages read
                  context["user_query"]. The cached dataframe, schema and
                  Excel metadata are injected before any step runs.

        Returns the same dict. On failure context["error"] is set and
        processing stops at that step; on success context["answer"] is set.
        """
        # Inject shared state into every query context
        context["dataframe"] = self._base_context.get("dataframe")
        context["schema"] = self._base_context.get("schema")
        context["schema_description"] = self._base_context.get("schema_description")
        context["semantic_map"] = self.semantic_map

        # Carry Excel-specific metadata so Analyzer / InsightGenerator can use it
        context["sheet_dataframes"] = self._base_context.get("sheet_dataframes", {})
        context["excel_sheets"] = self._base_context.get("excel_sheets", [])
        context["excel_mode"] = self._base_context.get("excel_mode", None)

        # STEP 1 – Input guard
        context = self.input_guard.run(context)
        if context.get("error"):
            return context

        # STEP 2 – Rule-based interpreter
        context = self.interpreter.run(context)
        if context.get("error"):
            return context

        confidence = context.get("intent_confidence", 0)

        # STEP 3 – LLM fallback (only when Ollama is available and confidence is low)
        if confidence < self.LLM_CONFIDENCE_THRESHOLD:
            if not self.llm_available:
                # Ollama is down — reject low-confidence queries cleanly
                context["error"] = (
                    "❓ I couldn't understand that query, and the LLM fallback "
                    "is unavailable (Ollama not running).\n\n"
                    "Try rephrasing with clearer keywords:\n"
                    + self._REPHRASE_EXAMPLES
                )
                return context

            # The LLM works on a shallow copy so a failed attempt cannot
            # leave partial keys in the live context.
            llm_context = self.llm_interpreter.run(dict(context))
            llm_intent = llm_context.get("intent")
            # A reply with neither an error nor an intent is treated as a
            # failure instead of raising KeyError.
            if llm_context.get("error") or llm_intent is None:
                context["error"] = (
                    "❓ I couldn't understand that query.\n\n"
                    "Try something like:\n"
                    + self._REPHRASE_EXAMPLES
                )
                return context
            context["intent"] = llm_intent
            context["llm_used"] = True

        # STEP 3.5a – Guard: user asked about a column that doesn't exist
        context = self._check_missing_column(context)
        if context.get("error"):
            return context

        # STEP 3.5b – Guard: trend query but no time column configured
        if context.get("intent", {}).get("no_time_column"):
            context["error"] = (
                "⏱️ This query needs a time column, but none was configured.\n\n"
                "Re-run QueryMind and set a time column at the setup prompt, "
                "or rephrase your question to use a different dimension."
            )
            return context

        # STEP 3.7 – Cross-sheet join resolution
        # Runs only when dimension column lives in a different sheet
        # than the metric column (e.g. "which manager had max sales?")
        context = self.join_resolver.run(context)
        if context.get("error"):
            return context

        # STEP 4 – Analyze
        context = self.analyzer.run(context)
        if context.get("error"):
            return context

        # STEP 5 – Generate insight
        context = self.insight_generator.run(context)

        if not context.get("answer"):
            raw = context.get("analysis")
            if raw is None:
                context["answer"] = "⚠️ Could not generate an answer for that query."
            elif hasattr(raw, "to_string"):
                context["answer"] = raw.to_string()
            else:
                # Scalar results (int/float/str) have no to_string().
                context["answer"] = str(raw)

        return context
|
app/data/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import chardet
|
|
3
|
+
from app.data.type_caster import smart_cast_df
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _detect_encoding(file_path: str) -> str:
|
|
7
|
+
"""
|
|
8
|
+
Detect file encoding.
|
|
9
|
+
Checks for BOM first (catches Excel-exported UTF-8 files),
|
|
10
|
+
then falls back to chardet for other encodings (latin-1, cp1252, etc.)
|
|
11
|
+
"""
|
|
12
|
+
with open(file_path, "rb") as f:
|
|
13
|
+
raw = f.read(4096)
|
|
14
|
+
|
|
15
|
+
# UTF-8 BOM — most common cause of \ufeff in column names
|
|
16
|
+
if raw.startswith(b"\xef\xbb\xbf"):
|
|
17
|
+
return "utf-8-sig"
|
|
18
|
+
|
|
19
|
+
# UTF-16 BOMs
|
|
20
|
+
if raw.startswith(b"\xff\xfe") or raw.startswith(b"\xfe\xff"):
|
|
21
|
+
return "utf-16"
|
|
22
|
+
|
|
23
|
+
detected = chardet.detect(raw)
|
|
24
|
+
return detected.get("encoding") or "utf-8"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _detect_delimiter(file_path: str, encoding: str) -> str:
|
|
28
|
+
"""
|
|
29
|
+
Detect delimiter by counting occurrences in the first line.
|
|
30
|
+
Handles comma, semicolon, tab, pipe — in that priority order on ties.
|
|
31
|
+
"""
|
|
32
|
+
try:
|
|
33
|
+
with open(file_path, "r", encoding=encoding, errors="replace") as f:
|
|
34
|
+
first_line = f.readline()
|
|
35
|
+
except Exception:
|
|
36
|
+
return ","
|
|
37
|
+
|
|
38
|
+
candidates = {",": 0, ";": 0, "\t": 0, "|": 0}
|
|
39
|
+
for delim in candidates:
|
|
40
|
+
candidates[delim] = first_line.count(delim)
|
|
41
|
+
|
|
42
|
+
best = max(candidates, key=candidates.get)
|
|
43
|
+
return best if candidates[best] > 0 else ","
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class CSVConnector:
    """Loads a CSV file into the pipeline context as a typed DataFrame plus schema."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def run(self, context: dict) -> dict:
        """
        Read the CSV and populate context["dataframe"] and context["schema"].

        Encoding and delimiter are auto-detected. Any failure is reported
        through context["error"] instead of being raised.
        """
        try:
            codec = _detect_encoding(self.file_path)
            sep = _detect_delimiter(self.file_path, codec)

            print(f"📄 CSV encoding={codec} delimiter={repr(sep)}")

            try:
                frame = pd.read_csv(
                    self.file_path,
                    encoding=codec,
                    sep=sep,
                    on_bad_lines="warn",  # skip malformed rows, don't crash
                )
            except pd.errors.EmptyDataError:
                # Zero bytes / nothing parseable at all.
                context["error"] = (
                    f"'{self.file_path}' is completely empty. "
                    f"Please provide a file with headers and at least one row of data."
                )
                return context

            # A header row was parsed but no data rows follow it.
            if frame.empty:
                context["error"] = (
                    f"'{self.file_path}' contains only headers and no data rows. "
                    f"Please provide a file with at least one row of data."
                )
                return context

            # Normalise headers: lower-case, trimmed, spaces → underscores.
            normalised = []
            for header in frame.columns:
                normalised.append(header.lower().strip().replace(" ", "_"))
            frame.columns = normalised

            # Shared numeric / packed-date casting.
            frame = smart_cast_df(frame)

            schema_columns = []
            for name in frame.columns:
                schema_columns.append({"name": name, "type": str(frame[name].dtype)})

            context["dataframe"] = frame
            context["schema"] = {"columns": schema_columns}

            print("✅ Columns detected:", frame.columns.tolist())
            print("📊 Data types:\n", frame.dtypes.to_string())

            return context

        except Exception as e:
            context["error"] = f"Failed to load CSV: {e}"
            return context
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from app.data.type_caster import smart_cast_df
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _normalize_col(col: str) -> str:
|
|
6
|
+
return col.lower().strip().replace(" ", "_")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ExcelConnector:
    """Loads one or more sheets of an Excel workbook into the pipeline context."""

    def __init__(self, file_path: str, selected_sheets: list):
        """
        file_path       – path to the workbook
        selected_sheets – names of the sheets to load (one → "single" mode,
                          several → "multi" mode)
        """
        self.file_path = file_path
        self.selected_sheets = selected_sheets

    def run(self, context: dict) -> dict:
        """
        Parse the selected sheets and populate the context.

        On success sets:
          context["dataframe"]        – all selected sheets stacked, with a
                                        "_sheet" column naming the source sheet
          context["sheet_dataframes"] – {sheet name: DataFrame} per sheet
          context["excel_sheets"]     – the selected sheet names
          context["excel_mode"]       – "single" or "multi"
          context["schema"]           – {"columns": [{"name", "type"}, ...]}

        Any failure is reported through context["error"] instead of raised.
        """
        # Without this guard an empty selection reaches pd.concat([]),
        # which raises ValueError.
        if not self.selected_sheets:
            context["error"] = "No sheets selected. Please choose at least one sheet to load."
            return context

        try:
            xl = pd.ExcelFile(self.file_path)
        except Exception as e:
            context["error"] = f"Cannot open Excel file: {e}"
            return context

        # ExcelFile keeps the workbook handle open; the context manager
        # closes it on every exit path, including the early error returns.
        with xl:
            available = xl.sheet_names
            invalid = [s for s in self.selected_sheets if s not in available]
            if invalid:
                context["error"] = f"Sheet(s) not found: {invalid}. Available: {available}"
                return context

            sheet_dfs = {}
            for sheet in self.selected_sheets:
                try:
                    df = xl.parse(sheet)
                    df.columns = [_normalize_col(c) for c in df.columns]
                    df = smart_cast_df(df)  # ← shared smart caster
                    sheet_dfs[sheet] = df
                except Exception as e:
                    context["error"] = f"Failed to parse sheet '{sheet}': {e}"
                    return context

        mode = "single" if len(self.selected_sheets) == 1 else "multi"

        if mode == "single":
            combined = sheet_dfs[self.selected_sheets[0]].copy()
            combined["_sheet"] = self.selected_sheets[0]
        else:
            frames = []
            for sheet, df in sheet_dfs.items():
                # Copy so the per-sheet frames stay free of the "_sheet" tag.
                df = df.copy()
                df["_sheet"] = sheet
                frames.append(df)
            combined = pd.concat(frames, ignore_index=True, sort=False)
            # pd.concat upcasts int64 → float64 when a column is missing
            # in some sheets (NaN rows force the upcast). Re-run downcast.
            combined = smart_cast_df(combined)

        print(f"✅ Excel loaded ({mode}): {self.selected_sheets}")
        print(f" Shape: {combined.shape} Cols: {combined.columns.tolist()}")
        print(f" Dtypes: {combined.dtypes.to_dict()}")

        context["dataframe"] = combined
        context["sheet_dataframes"] = sheet_dfs
        context["excel_sheets"] = self.selected_sheets
        context["excel_mode"] = mode
        context["schema"] = {
            "columns": [
                {"name": col, "type": str(combined[col].dtype)}
                for col in combined.columns
            ]
        }
        return context
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class SchemaEngine:
    """Turns context["schema"] into a plain-text description of the table."""

    def run(self, context):
        """
        Add context["schema_description"], one "- name (type)" line per column.

        If the context has no schema, or an empty one, it is returned
        unchanged (after printing the context for diagnosis).
        """
        # .get() so a context without a "schema" key reaches the guard
        # below instead of raising KeyError.
        schema = context.get("schema")
        print("Schema: ", schema)

        if not schema:
            print("\n -------------- CONTEXT------------ \n", context)

            return context

        column_lines = [
            f"- {col['name']} ({col['type']})\n" for col in schema.get("columns", [])
        ]
        context["schema_description"] = "Table: data\n\nColumns:\n" + "".join(column_lines)

        return context
|