PyPI - querymind-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

querymind-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

app/agents/InterpreterAgent.py +473 -0
app/agents/__init__.py +0 -0
app/agents/insights_generator.py +151 -0
app/agents/intent_corrector.py +59 -0
app/agents/llm_intepreter.py +132 -0
app/agents/narrator.py +27 -0
app/agents/planner.py +77 -0
app/cli/__init__.py +0 -0
app/cli/main.py +346 -0
app/cli/tui_app.py +98 -0
app/cli/ui.py +21 -0
app/core/__init__.py +0 -0
app/core/context.py +10 -0
app/core/logger.py +2 -0
app/core/pipeline.py +379 -0
app/data/__init__.py +0 -0
app/data/connectors/csv_connector.py +99 -0
app/data/connectors/excel_connector.py +68 -0
app/data/connectors/no_sql_db_connector.py +0 -0
app/data/connectors/sql_db_connector.py +0 -0
app/data/schema_engine.py +18 -0
app/data/type_caster.py +128 -0
app/executor/__init__.py +0 -0
app/executor/db_executor.py +0 -0
app/executor/sheet_selector.py +120 -0
app/llm/ollama_client.py +47 -0
app/prompts/interpreter_prompt.txt +28 -0
app/security/__init__.py +0 -0
app/security/input_guard.py +133 -0
app/security/schema_filter.py +20 -0
app/tests/__init__.py +0 -0
app/tests/llm_test.py +18 -0
app/tools/__init__.py +0 -0
app/tools/analyzer.py +157 -0
app/tools/join_resolver.py +159 -0
app/tools/sql_writer.py +37 -0
app/tools/validator.py +0 -0
querymind_cli-0.1.0.dist-info/METADATA +139 -0
querymind_cli-0.1.0.dist-info/RECORD +43 -0
querymind_cli-0.1.0.dist-info/WHEEL +5 -0
querymind_cli-0.1.0.dist-info/entry_points.txt +2 -0
querymind_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
querymind_cli-0.1.0.dist-info/top_level.txt +1 -0

app/agents/llm_intepreter.py ADDED Viewed

@@ -0,0 +1,132 @@
+import json
+from app.llm.ollama_client import OllamaClient
+class LLMInterpreter:
+    """
+    Falls back to an LLM when the rule-based interpreter has low confidence.
+    Converts a natural-language query into the same structured intent dict
+    that InterpreterAgent produces, so the rest of the pipeline is unaware
+    of which interpreter was used.
+    """
+    # Substrings that mark a column name as (probably) numeric.
+    _NUMERIC_HINTS = (
+        "amount",
+        "price",
+        "spent",
+        "revenue",
+        "sales",
+        "total",
+        "count",
+        "qty",
+        "quantity",
+    )
+    # The only query_type values accepted from the model.
+    _QUERY_TYPES = ("comparison", "aggregation", "trend", "top_n")
+    def __init__(self):
+        self.client = OllamaClient()
+    # ------------------------------------------------------------------
+    # Prompt
+    # ------------------------------------------------------------------
+    def build_prompt(self, query: str, columns: list, semantic_map: dict) -> str:
+        """
+        Build the instruction prompt sent to the LLM.
+        Columns whose name contains one of ``_NUMERIC_HINTS`` are listed as
+        numeric; every other column is listed as categorical. The match is
+        case-insensitive so that e.g. "Revenue" is recognised as well as
+        "revenue". ``semantic_map`` supplies the default metric, dimension
+        and time column shown to the model.
+        """
+        metric = semantic_map.get("metric", "")
+        dimension = semantic_map.get("dimension", "")
+        time_col = semantic_map.get("time", "")
+        numeric_cols = [
+            c
+            for c in columns
+            if any(hint in str(c).lower() for hint in self._NUMERIC_HINTS)
+        ]
+        categorical_cols = [c for c in columns if c not in numeric_cols]
+        return f"""You are a data analyst assistant.
+Convert the user query into a JSON intent object. Use ONLY column names from the list below.
+Dataset columns  : {columns}
+Numeric columns  : {numeric_cols or "unknown — pick best match"}
+Categorical cols : {categorical_cols or "unknown — pick best match"}
+Default metric   : {metric}
+Default dimension: {dimension}
+Time column      : {time_col or "none"}
+User query: "{query}"
+Return ONLY valid JSON — no explanation, no markdown fences.
+{{
+  "metric":     "<column_name>",
+  "dimension":  "<column_name>",
+  "operation":  "sum | mean | count",
+  "query_type": "comparison | aggregation | trend | top_n",
+  "limit":      <number or null>
+}}
+Rules
+- metric    must be a numeric column
+- dimension must be a categorical or time column
+- "highest" / "most" / "compare"    → query_type = "comparison"
+- "top N" / "bottom N"              → query_type = "top_n", limit = N (default 5)
+- "average" / "avg" / "mean"        → query_type = "aggregation", operation = "mean"
+- "trend" / "over time" / "monthly" → query_type = "trend", dimension = time column
+- anything else                     → query_type = "aggregation", operation = "sum"
+"""
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+    def _parse(self, text: str) -> dict | None:
+        """
+        Decode the span from the first "{" to the last "}" of *text* as JSON.
+        Returns the decoded dict, or None when *text* is not a string, has
+        no brace pair, or the span is not valid JSON.
+        """
+        if not isinstance(text, str):
+            return None
+        start = text.find("{")
+        end = text.rfind("}") + 1
+        if start == -1 or end == 0:
+            return None
+        try:
+            parsed = json.loads(text[start:end])
+        except (ValueError, RecursionError):
+            # json.JSONDecodeError is a ValueError; RecursionError covers
+            # pathologically nested model output.
+            return None
+        return parsed if isinstance(parsed, dict) else None
+    def _valid(self, intent: dict, columns: list) -> bool:
+        """
+        Return True when *intent* is non-empty, its metric and dimension are
+        both names from *columns*, and its query_type is a supported one.
+        """
+        if not intent:
+            return False
+        if intent.get("metric") not in columns:
+            return False
+        if intent.get("dimension") not in columns:
+            return False
+        return intent.get("query_type") in self._QUERY_TYPES
+    # ------------------------------------------------------------------
+    # Main
+    # ------------------------------------------------------------------
+    def run(self, context: dict) -> dict:
+        """
+        Ask the LLM for an intent and record the outcome on *context*.
+        Reads ``user_query``, ``schema["columns"]`` (a list of dicts with a
+        ``name`` key) and the optional ``semantic_map``. On success sets
+        ``context["intent"]`` and ``context["llm_used"] = True``; otherwise
+        sets ``context["error"]``. The same context object is returned.
+        """
+        query = context["user_query"]
+        schema = context["schema"]["columns"]
+        semantic_map = context.get("semantic_map", {})
+        columns = [col["name"] for col in schema]
+        prompt = self.build_prompt(query, columns, semantic_map)
+        response = self.client.generate(prompt)
+        intent = self._parse(response)
+        if not self._valid(intent, columns):
+            # str() keeps the error path from raising when the client
+            # returned something that cannot be sliced (e.g. None).
+            context["error"] = (
+                f"LLM returned an invalid intent. Raw response: {str(response)[:200]}"
+            )
+            return context
+        context["intent"] = intent
+        context["llm_used"] = True
+        return context

app/agents/narrator.py ADDED Viewed

@@ -0,0 +1,27 @@
+class Narrator:
+    """
+    Last pipeline stage: tidies the answer text before it reaches the UI.
+    """
+    # An answer containing any of these is treated as already formatted.
+    _MARKERS = ("💡", "📊", "📌")
+    def run(self, context):
+        """
+        Clean ``context["answer"]`` in place and return the context.
+        A missing or falsy answer is left alone; so is any answer that
+        cannot be processed as text.
+        """
+        raw = context.get("answer")
+        if raw:
+            try:
+                context["answer"] = self._polish(raw)
+            except Exception:
+                # Best effort: an answer that is not text passes through unchanged.
+                pass
+        return context
+    def _polish(self, text):
+        """Collapse a doubled bulb emoji, trim whitespace, add a bulb prefix if no marker is present."""
+        text = text.replace("💡 💡", "💡").strip()
+        for marker in self._MARKERS:
+            if marker in text:
+                return text
+        return f"💡 {text}"

app/agents/planner.py ADDED Viewed

@@ -0,0 +1,77 @@
+"""
+PlannerAgent — lightweight pre-flight check.
+NOTE: This agent is currently unused by the pipeline (InterpreterAgent
+handles intent extraction). Keep it here for future multi-step query
+planning (e.g. chaining two analyses together, or deciding whether a
+query needs a trend + comparison combined answer).
+"""
+class PlannerAgent:
+    """
+    Validates that the semantic_map is properly configured and that the
+    query contains at least one actionable keyword before the pipeline runs.
+    Returns an error only for genuinely unactionable input (empty, numeric-only).
+    For low-confidence / keyword-free natural-language queries the pipeline
+    should route to the LLMInterpreter instead of hard-failing here.
+    """
+    # Matched as plain substrings of the lower-cased query.
+    VALID_KEYWORDS = {
+        "highest",
+        "lowest",
+        "top",
+        "bottom",
+        "average",
+        "avg",
+        "mean",
+        "trend",
+        "over time",
+        "monthly",
+        "daily",
+        "total",
+        "sum",
+        "most",
+        "least",
+        "distribution",
+        "breakdown",
+        "compare",
+        "how much",
+        "how many",
+        "which",
+        "where",
+        "what",
+    }
+    def run(self, context: dict) -> dict:
+        """
+        Pre-flight check of *context*; returns the same dict.
+        Sets ``context["error"]`` when the semantic map is missing, when its
+        metric or dimension is unset, or when the query is empty or consists
+        only of digits. Otherwise sets ``context["planner_has_keyword"]`` to
+        whether any of ``VALID_KEYWORDS`` occurs in the query; that flag is
+        informational and never blocks.
+        """
+        # "or" also covers a key that is present but None; str() covers
+        # non-string values, so neither can raise on .lower().
+        query = str(context.get("user_query") or "").lower().strip()
+        semantic_map = context.get("semantic_map")
+        # --- Validate semantic map ---
+        if not semantic_map:
+            context["error"] = (
+                "Semantic map missing. Please restart and configure your dataset."
+            )
+            return context
+        metric = semantic_map.get("metric")
+        dimension = semantic_map.get("dimension")
+        if not metric or not dimension:
+            context["error"] = (
+                "Metric or dimension not configured. "
+                "Please restart and select valid columns."
+            )
+            return context
+        # --- Guard obviously bad input ---
+        if not query or query.isdigit():
+            context["error"] = "Please enter a meaningful question."
+            return context
+        # --- Route decision (informational, does NOT block) ---
+        has_keyword = any(kw in query for kw in self.VALID_KEYWORDS)
+        context["planner_has_keyword"] = has_keyword
+        return context

app/cli/__init__.py ADDED Viewed

File without changes

app/cli/main.py ADDED Viewed

@@ -0,0 +1,346 @@
+import os
+import sys
+import pandas as pd
+from rich.console import Console
+from rich.panel import Panel
+from rich.prompt import Prompt
+from rich.table import Table
+from app.core.pipeline import QueryMindPipeline
+from app.cli.tui_app import QueryMindApp
+from app.data.connectors.csv_connector import CSVConnector
+from app.data.connectors.excel_connector import ExcelConnector
+from app.executor.sheet_selector import prompt_sheet_selection
+console = Console()
+# File extensions routed to the Excel and CSV loaders respectively.
+EXCEL_EXTS = {".xlsx", ".xls", ".xlsm", ".xlsb"}
+CSV_EXTS = {".csv", ".tsv"}
+# Words that mean "I want to quit" at any prompt
+EXIT_WORDS = {"exit", "quit", "/exit", "/quit", "bye", "/bye", "q", ":q"}
+# ─────────────────────────────────────────────
+# CLEAN EXIT SIGNAL
+# ─────────────────────────────────────────────
+class UserExitError(Exception):
+    """Signals that the user asked to leave by typing an exit command at a prompt."""
+def ask(message: str, default: str | None = None) -> str:
+    """
+    Prompt the user via Prompt.ask and return the raw answer.
+    - ``default`` is forwarded only when it is not None
+    - Raises UserExitError when the answer (trimmed, case-insensitive) is
+      one of EXIT_WORDS, so callers need no special exit logic
+    - Ctrl+C (KeyboardInterrupt) and end-of-input (EOFError) are turned
+      into UserExitError as well
+    """
+    try:
+        value = (
+            Prompt.ask(message, default=default)
+            if default is not None
+            else Prompt.ask(message)
+        )
+    except (KeyboardInterrupt, EOFError):
+        # "from None": the interrupt is the exit request itself, not an
+        # error worth chaining into the traceback.
+        raise UserExitError() from None
+    if value.strip().lower() in EXIT_WORDS:
+        raise UserExitError()
+    return value
+# ─────────────────────────────────────────────
+# HELPERS
+# ─────────────────────────────────────────────
+def normalize_column(col: object) -> str:
+    """
+    Return *col* lower-cased, trimmed, with spaces replaced by underscores.
+    Non-string headers (for example an integer year used as a spreadsheet
+    column header) are converted with str() first instead of raising.
+    """
+    return str(col).lower().strip().replace(" ", "_")
+def validate_column(input_col: str, columns: list) -> str | None:
+    """
+    Normalize *input_col* and return it when it names one of *columns*.
+    Otherwise print a "not found" message and return None.
+    """
+    candidate = normalize_column(input_col)
+    if candidate in columns:
+        return candidate
+    console.print(f"[red]❌ '{candidate}' not found. Choose from the list above.[/red]")
+    return None
+def prompt_column(message: str, columns: list, optional: bool = False) -> str | None:
+    """
+    Keep asking until the user names a column from *columns*.
+    When *optional* is true, a blank answer returns None. Every attempt
+    goes through ask(), so exit words are honoured each time.
+    """
+    styled = f"[cyan]{message}[/cyan]"
+    while True:
+        if optional:
+            answer = ask(styled, default="")
+            if answer.strip() == "":
+                return None
+        else:
+            answer = ask(styled)
+        match = validate_column(answer, columns)
+        if match:
+            return match
+def detect_column_types(df: pd.DataFrame) -> tuple:
+    """
+    Returns (numeric_cols, categorical_cols, datetime_cols).
+    Numeric columns come from the dtype. Each remaining column (except the
+    internal "_sheet" marker) counts as datetime when its first 20 non-null
+    values all parse with pd.to_datetime, and as categorical otherwise.
+    A column with no non-null values is categorical.
+    """
+    import warnings  # local: only used to silence pandas parse warnings below
+    numeric = df.select_dtypes(include="number").columns.tolist()
+    other_cols = df.select_dtypes(exclude="number").columns.tolist()
+    datetime_cols, categorical = [], []
+    for col in other_cols:
+        if col == "_sheet":
+            continue
+        col_data = df[col]
+        if isinstance(col_data, pd.DataFrame):
+            col_data = col_data.iloc[:, 0]  # duplicate col name → take first
+        sample = col_data.dropna().head(20)
+        if sample.empty:
+            # to_datetime() succeeds on empty input, which would wrongly
+            # report an all-null column as a datetime column.
+            categorical.append(col)
+            continue
+        try:
+            # No infer_datetime_format: it is deprecated since pandas 2.0,
+            # where format inference is the default behaviour anyway.
+            with warnings.catch_warnings():
+                # Failed format inference warns once per column; keep that
+                # noise off the user's terminal.
+                warnings.simplefilter("ignore")
+                pd.to_datetime(sample)
+        except Exception:
+            # Deliberately broad: pandas signals "not a date" with several
+            # exception types, and any failure just means "categorical".
+            categorical.append(col)
+        else:
+            datetime_cols.append(col)
+    return numeric, categorical, datetime_cols
+def show_columns(df: pd.DataFrame, numeric: list, categorical: list, datetime: list):
+    """
+    Print a "Detected Columns" table: name, type label and up to three
+    non-null sample values per column. The internal "_sheet" column is
+    skipped. A column in neither *numeric* nor *datetime* is labelled
+    categorical.
+    """
+    overview = Table(title="Detected Columns", border_style="blue", show_lines=False)
+    for heading, style in (
+        ("Column", "bold white"),
+        ("Type", "dim"),
+        ("Sample values", "dim"),
+    ):
+        overview.add_column(heading, style=style)
+    for name in df.columns:
+        if name == "_sheet":
+            continue
+        if name in numeric:
+            label = "[green]numeric[/green]"
+        elif name in datetime:
+            label = "[magenta]datetime[/magenta]"
+        else:
+            label = "[yellow]categorical[/yellow]"
+        try:
+            series = df[name]
+            # A duplicated column name makes df[name] a DataFrame; use its first column.
+            if isinstance(series, pd.DataFrame):
+                series = series.iloc[:, 0]
+            preview = ", ".join(str(v) for v in series.dropna().head(3).tolist())
+        except Exception:
+            preview = "(error reading samples)"
+        overview.add_row(name, label, preview)
+    console.print(overview)
+# ─────────────────────────────────────────────
+# FILE LOADING
+# ─────────────────────────────────────────────
+def load_file(file_path: str) -> tuple:
+    """
+    Returns (connector, preview_df).
+    The preview holds at most 100 rows per sheet (Excel) or per file (CSV),
+    with normalized column names. Excel previews carry an extra "_sheet"
+    column naming the source sheet.
+    Raises RuntimeError for bad files (unsupported extension, no sheets
+    selected, empty file, fewer than two columns), UserExitError if user
+    quits mid-flow.
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext in EXCEL_EXTS:
+        selected_sheets = prompt_sheet_selection(
+            file_path
+        )  # exit-aware (see sheet_selector.py)
+        if not selected_sheets:
+            raise RuntimeError("No sheets selected.")
+        connector = ExcelConnector(file_path, selected_sheets)
+        frames = []
+        # Context manager so the workbook handle is closed even when a
+        # sheet fails to parse.
+        with pd.ExcelFile(file_path) as xl:
+            for s in selected_sheets:
+                df = xl.parse(s, nrows=100)
+                df.columns = [normalize_column(c) for c in df.columns]
+                df["_sheet"] = s
+                frames.append(df)
+        preview_df = pd.concat(frames, ignore_index=True, sort=False)
+        real_cols = [c for c in preview_df.columns if c != "_sheet"]
+        if len(real_cols) < 2:
+            col_name = real_cols[0] if real_cols else "none"
+            raise RuntimeError(
+                f"The selected sheet(s) only have 1 column ('{col_name}'). "
+                f"QueryMind needs at least one metric column and one dimension column. "
+                f"Please select sheets with 2 or more columns."
+            )
+        return connector, preview_df
+    elif ext in CSV_EXTS:
+        from app.data.connectors.csv_connector import (
+            _detect_encoding,
+            _detect_delimiter,
+        )
+        connector = CSVConnector(file_path)
+        encoding = _detect_encoding(file_path)
+        delimiter = _detect_delimiter(file_path, encoding)
+        try:
+            preview_df = pd.read_csv(
+                file_path,
+                encoding=encoding,
+                sep=delimiter,
+                nrows=100,
+                on_bad_lines="warn",
+            )
+        except pd.errors.EmptyDataError as exc:
+            # Chain the pandas error so the original cause stays visible
+            # when debugging.
+            raise RuntimeError(
+                f"'{file_path}' is completely empty. "
+                f"Please provide a file with headers and at least one row of data."
+            ) from exc
+        if preview_df.empty:
+            raise RuntimeError(
+                f"'{file_path}' contains only headers and no data rows. "
+                f"Please provide a file with at least one row of data."
+            )
+        preview_df.columns = [normalize_column(c) for c in preview_df.columns]
+        # Warn about duplicate column names after normalization
+        dupes = [
+            c for c in preview_df.columns if preview_df.columns.tolist().count(c) > 1
+        ]
+        if dupes:
+            unique_dupes = list(dict.fromkeys(dupes))  # preserve order, deduplicate
+            console.print(
+                f"[yellow]⚠️  Duplicate column names detected after normalization: "
+                f"{unique_dupes}. Only the first occurrence of each will be used.[/yellow]"
+            )
+            preview_df = preview_df.loc[:, ~preview_df.columns.duplicated()]
+        if len(preview_df.columns) < 2:
+            raise RuntimeError(
+                f"'{file_path}' only has 1 column ('{preview_df.columns[0]}'). "
+                f"QueryMind needs at least one metric column and one dimension column. "
+                f"Please provide a file with 2 or more columns."
+            )
+        return connector, preview_df
+    else:
+        raise RuntimeError(
+            f"Unsupported file type: '{ext}'. "
+            f"Supported: {sorted(EXCEL_EXTS | CSV_EXTS)}"
+        )
+# ─────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────
+def main():
+    """
+    Interactive entry point of the CLI.
+    Flow: print the banner, ask for a file path until load_file() succeeds,
+    show the detected columns, ask for the metric / dimension / optional
+    time column, build the QueryMindPipeline and run the QueryMindApp UI.
+    A UserExitError raised at any prompt, or the UI returning, ends with
+    the terminal being cleared and a goodbye panel. If the pipeline raises
+    RuntimeError on construction, the error is printed and the function
+    returns without the goodbye panel.
+    """
+    console.print(
+        Panel.fit(
+            "[bold cyan]🧠 QueryMind[/bold cyan]\n[green]CLI AI Data Analyst[/green]",
+            border_style="blue",
+        )
+    )
+    console.print(
+        "[dim]  Type [bold]exit[/bold] or [bold]quit[/bold] at any prompt to leave.[/dim]\n"
+    )
+    try:
+        # ── File input ────────────────────────────────────────────────────
+        # Loop until a file loads; every failure is reported and re-prompted.
+        while True:
+            file_path = ask(
+                "\n[cyan]📁 Enter file path[/cyan] [dim](.csv, .xlsx, .xls)[/dim]"
+            )
+            try:
+                connector, preview_df = load_file(file_path)
+                rows = len(preview_df)
+                console.print(
+                    f"[green]✅ Loaded {rows:,} preview rows × "
+                    f"{len([c for c in preview_df.columns if c != '_sheet'])} columns[/green]"
+                )
+                break
+            except UserExitError:
+                raise  # bubble up — sheet selector raised it mid-flow
+            except RuntimeError as e:
+                console.print(f"[red]❌ {e}[/red]")
+                console.print(
+                    "[yellow]Please try again or type 'exit' to quit.[/yellow]"
+                )
+            except Exception as e:
+                console.print(f"[red]❌ Failed to load: {e}[/red]")
+                console.print(
+                    "[yellow]Please try again or type 'exit' to quit.[/yellow]"
+                )
+        # "_sheet" is an internal marker column, never offered to the user.
+        columns_all = [c for c in preview_df.columns if c != "_sheet"]
+        numeric_cols, categorical_cols, datetime_cols = detect_column_types(preview_df)
+        # ── Show column overview ──────────────────────────────────────────
+        console.print()
+        show_columns(preview_df, numeric_cols, categorical_cols, datetime_cols)
+        # ── Semantic mapping ──────────────────────────────────────────────
+        console.print("\n[bold cyan]🧠 Help me understand your data[/bold cyan]")
+        console.print("[dim]Use column names exactly as shown above.[/dim]\n")
+        # Suggestions are only displayed; they are not passed to the
+        # prompts below as default answers.
+        default_metric = numeric_cols[0] if numeric_cols else ""
+        default_dimension = categorical_cols[0] if categorical_cols else ""
+        default_time = datetime_cols[0] if datetime_cols else ""
+        if default_metric:
+            console.print(f"[dim]  Suggested metric    → {default_metric}[/dim]")
+        if default_dimension:
+            console.print(f"[dim]  Suggested dimension → {default_dimension}[/dim]")
+        if default_time:
+            console.print(f"[dim]  Suggested time col  → {default_time}[/dim]")
+        console.print()
+        metric = prompt_column(
+            "👉 Which column is the main VALUE to measure? (metric)", columns_all
+        )
+        dimension = prompt_column(
+            "👉 Which column to GROUP BY by default? (dimension)", columns_all
+        )
+        time_col = prompt_column(
+            "👉 Time column for trend queries? (optional — press Enter to skip)",
+            columns_all,
+            optional=True,
+        )
+        semantic_map = {"metric": metric, "dimension": dimension, "time": time_col}
+        console.print(
+            f"\n[green]✅ Semantic map:[/green] "
+            f"metric=[bold]{metric}[/bold]  "
+            f"dimension=[bold]{dimension}[/bold]  "
+            f"time=[bold]{time_col or 'none'}[/bold]"
+        )
+        # ── Build pipeline ────────────────────────────────────────────────
+        console.print("\n[dim]Loading data and building pipeline…[/dim]")
+        try:
+            pipeline = QueryMindPipeline(connector, semantic_map)
+        except RuntimeError as e:
+            console.print(f"[red]❌ Pipeline failed to start: {e}[/red]")
+            # Early return: skips the screen clear so the error stays visible.
+            return
+        console.print("[green]✅ Launching QueryMind UI…[/green]\n")
+        app = QueryMindApp(pipeline)
+        app.run()
+    except UserExitError:
+        pass  # fall through to goodbye message
+    # Clear the terminal ("cls" on Windows, "clear" elsewhere) before the goodbye panel.
+    os.system("cls" if os.name == "nt" else "clear")
+    console.print(
+        Panel.fit(
+            "[bold cyan]👋 Goodbye![/bold cyan]\n"
+            "[dim]Thanks for using QueryMind.[/dim]",
+            border_style="blue",
+        )
+    )
+# Start the CLI only when this module is executed as a script, not on import.
+if __name__ == "__main__":
+    main()

app/cli/tui_app.py ADDED Viewed

@@ -0,0 +1,98 @@
+from textual.app import App, ComposeResult
+from textual.widgets import Header, Footer, Input, Static
+from textual.containers import Horizontal, Vertical
+from textual.reactive import reactive
+from app.core.pipeline import QueryMindPipeline
+from app.core.context import Context
+class QueryMindApp(App):
+    CSS = """
+    #top {
+        height: 10;
+    }
+    #chat {
+        border: round green;
+        padding: 1;
+    }
+    #input {
+        dock: bottom;
+    }
+    """
+    BINDINGS = [
+        ("q", "quit", "Quit"),
+        ("ctrl+c", "quit", "Quit"),
+    ]
+    def __init__(self, pipeline: QueryMindPipeline):
+        super().__init__()
+        self.pipeline = pipeline
+        self.chat_history = "🧠 QueryMind Ready\n"
+        # Show active sheet in system info if available
+        active = getattr(pipeline, "_base_context", {}).get("active_sheet", "")
+        self._active_sheet = active
+    def compose(self) -> ComposeResult:
+        yield Header()
+        with Horizontal(id="top"):
+            yield Static(self._get_banner(), id="banner")
+            yield Static(self._get_system_info(), id="system")
+        self.chat = Static(self.chat_history, id="chat")
+        yield self.chat
+        self.input = Input(placeholder="Ask a question about your data...", id="input")
+        yield self.input
+        yield Footer()
+    # ------------------------------------------------------------------ #
+    def _get_banner(self) -> str:
+        return "   🧠 QueryMind\n   AI Data Analyst\n"
+    def _get_system_info(self) -> str:
+        sheet_line = f"Sheet : {self._active_sheet}\n" if self._active_sheet else ""
+        llm_status = (
+            "LLM   : ✅ Ollama (phi)"
+            if getattr(self.pipeline, "llm_available", False)
+            else "LLM   : ⚠️  Offline (rule-based only)"
+        )
+        return f"Agent : QueryMind\nMode  : Local Analysis\n{llm_status}\n{sheet_line}"
+    # ------------------------------------------------------------------ #
+    async def on_input_submitted(self, event: Input.Submitted):
+        query = event.value.strip()
+        if not query:
+            return
+        if query.lower() in ("exit", "quit", "/bye", "bye", "/c"):
+            self.exit()
+            return
+        self.chat_history += f"\n>> {query}"
+        context = Context(query)
+        result = self.pipeline.run(context)
+        if result.get("error"):
+            response = f"❌ {result['error']}"
+        else:
+            response = result.get("answer", "No answer generated.")
+        # Show which sheet the answer came from (useful in multi-sheet mode)
+        active = result.get("active_sheet", "")
+        if active and "+" in active:
+            response = f"[{active}]\n{response}"
+        self.chat_history += f"\n💡 {response}\n"
+        self.chat.update(self.chat_history)
+        self.input.value = ""