PyPI - kaizenstat - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

kaizenstat 0.2.2.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kaizenstat
-Version: 0.2.2
+Version: 0.2.4
 Summary: Zero-friction AutoML + Data Cleaning Toolkit
 Author: Masuddar Rahman
 Requires-Python: >=3.8
@@ -84,6 +84,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
 | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
 | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
 | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
+| - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
+| - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
+| - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
 ---
@@ -109,6 +112,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
 # 4. Generate standalone code for reproduction
 KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
+# 5. Dual-Mode Conversational AI (OpenRouter powered)
+# Runs automated structured AI analysis
+analysis = KaizenStat.analyze(df, target="target_column")
+# Ask custom developer queries about data or pipeline
+KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
+# Multi-turn conversation with memory context
+KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
 ```
 ### 2. Command Line Interface (CLI)

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/README.md RENAMED Viewed

@@ -54,6 +54,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
 | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
 | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
 | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
+| - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
+| - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
+| - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
 ---
@@ -79,6 +82,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
 # 4. Generate standalone code for reproduction
 KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
+# 5. Dual-Mode Conversational AI (OpenRouter powered)
+# Runs automated structured AI analysis
+analysis = KaizenStat.analyze(df, target="target_column")
+# Ask custom developer queries about data or pipeline
+KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
+# Multi-turn conversation with memory context
+KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
 ```
 ### 2. Command Line Interface (CLI)

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/kaizenstat/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from .core import KaizenStat, DataEngine, detect_device
-__version__ = "0.2.2"
+__version__ = "0.2.4"
 __all__ = ["KaizenStat", "DataEngine", "detect_device", "__version__"]

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/kaizenstat/core.py RENAMED Viewed

@@ -5,6 +5,9 @@
 import os
 import time
 import warnings
+import json
+import urllib.request
+from urllib.error import URLError, HTTPError
 from typing import Optional, Dict, List, Union
 import numpy as np
@@ -171,7 +174,13 @@ class KaizenStat:
         report(data, target, output_path)  → Generate interactive HTML report
         save_model(pipeline, path) → Export trained model
         load_model(path)         → Load exported model
+        analyze(df, target)      → Intelligent dataset analysis
+        ask(query)               → Conversational AI support
+        ask_followup(query)      → Conversational AI follow-up support
     """
+    DEFAULT_API_KEY = "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"
+    _last_context = None
+    _conversation_history = []
     # ==========================
     # 🧠 VALIDATION
@@ -257,6 +266,7 @@ class KaizenStat:
                 if imbalanced:
                     print(f"  ⚠️  Class Imbalance Detected (majority > 65%)")
+        KaizenStat._last_audit_findings = findings
         return findings
     # ==========================
@@ -362,6 +372,7 @@ class KaizenStat:
         if not dropped_cols and not actions:
             print("  ✓ Dataset was already clean")
+        KaizenStat._last_dropped_cols = dropped_cols
         return df
     # ==========================
@@ -519,6 +530,7 @@ class KaizenStat:
         KaizenStat._last_label_encoder = label_encoder
         KaizenStat._last_task_type = "classification" if is_classification else "regression"
         KaizenStat._last_target = target
+        KaizenStat._last_results_df = results_df
         return results_df
@@ -546,6 +558,9 @@ class KaizenStat:
         results = KaizenStat.benchmark(df, target)
         print(f"\n🏆 BEST MODEL: {results.iloc[0]['Model']} (Score: {results.iloc[0]['Score']:.4f})")
+        # Build and store context for conversational AI
+        KaizenStat._last_context = KaizenStat._build_context(df, target)
         return results
@@ -718,7 +733,7 @@ for col in list(df.columns):
 num_features = {num_features}
 cat_features = {cat_features}
-X = df[num_features + cat_features]
+X = df[num_features + cat_features].copy()
 y = df["{target}"]
 {"" if not needs_label_encoder else """
 # Encode string labels
@@ -727,9 +742,9 @@ y = le.fit_transform(y)
 """}
 # Fill missing values
 if num_features:
-    X[num_features] = X[num_features].fillna(X[num_features].median())
+    X.loc[:, num_features] = X[num_features].fillna(X[num_features].median())
 for col in cat_features:
-    X[col] = X[col].fillna(X[col].mode().iloc[0] if not X[col].mode().empty else "Unknown")
+    X.loc[:, col] = X[col].fillna(X[col].mode().iloc[0] if not X[col].mode().empty else "Unknown")
 # 4. Preprocessing Pipeline
 preprocessor = ColumnTransformer([
@@ -1265,4 +1280,289 @@ with tab4:
         print(f"   Open: http://localhost:{port}")
         print(f"   Press Ctrl+C to stop\n")
-        os.system(f"streamlit run {app_file} --server.port {port} --server.headless true")
+        os.system(f"streamlit run {app_file} --server.port {port} --server.headless true")
+    # ==========================
+    # 🧠 AI CHAT & ANALYZE
+    # ==========================
+    @staticmethod
+    def _build_context(df: pd.DataFrame, target: str) -> dict:
+        """
+        Assemble the JSON-serializable dataset/pipeline summary used by the AI prompts.
+        Args:
+            df: Dataset to summarize.
+            target: Name of the target column (skipped in the cardinality scan).
+        Returns:
+            Dict with the keys shape, missing, dropped_cols, model, score,
+            imbalance and high_cardinality, holding only plain Python values.
+        """
+        # Reuse cached audit findings when present; otherwise audit with stdout muted
+        findings = getattr(KaizenStat, "_last_audit_findings", {})
+        if not findings:
+            import io, contextlib
+            with contextlib.redirect_stdout(io.StringIO()):
+                try:
+                    findings = KaizenStat.audit(df, target)
+                except Exception:
+                    # Best effort: the context is still built without audit data
+                    findings = {}
+        # Non-numeric feature columns with more than 20 distinct values
+        non_numeric = df.select_dtypes(exclude=[np.number]).columns
+        wide_cols = [c for c in non_numeric if c != target and df[c].nunique() > 20]
+        # Cached dropped-column entries may be tuples (first element is used) or plain values
+        cached_drops = getattr(KaizenStat, "_last_dropped_cols", [])
+        drop_names = []
+        if isinstance(cached_drops, list):
+            drop_names = [
+                str(entry[0]) if isinstance(entry, tuple) and len(entry) > 0 else str(entry)
+                for entry in cached_drops
+            ]
+        # First row of the last stored results table, if there is one
+        leaderboard = getattr(KaizenStat, "_last_results_df", None)
+        if leaderboard is None or leaderboard.empty:
+            top_model, top_score = "None", 0.0
+        else:
+            top_row = leaderboard.iloc[0]
+            top_model, top_score = top_row["Model"], float(top_row["Score"])
+        is_imbalanced = findings.get("imbalanced", False)
+        # Per-column null counts, keeping only the columns that have any
+        null_counts = df.isna().sum()
+        null_by_col = null_counts[null_counts > 0].to_dict()
+        # Cast everything to built-in types so json.dumps accepts the result
+        n_rows, n_cols = df.shape[0], df.shape[1]
+        return {
+            "shape": [int(n_rows), int(n_cols)],
+            "missing": {str(name): int(count) for name, count in null_by_col.items()},
+            "dropped_cols": [str(name) for name in drop_names],
+            "model": str(top_model),
+            "score": float(top_score),
+            "imbalance": bool(is_imbalanced),
+            "high_cardinality": [str(name) for name in wide_cols]
+        }
+    @staticmethod
+    def _get_system_prompt(context: dict) -> str:
+        """
+        Render the system prompt with the dataset context embedded as JSON.
+        Args:
+            context: Dataset/pipeline summary; it must be serializable by json.dumps.
+        Returns:
+            The prompt template with its {context} token replaced by the
+            context serialized as indented JSON.
+        """
+        # Plain (non-f) string: {context} stays a literal token until the replace() below
+        prompt_template = """You are an expert Data Scientist AI assistant integrated inside a system called KaizenStat.
+You are NOT a generic chatbot. You MUST ONLY answer based on the structured dataset context provided below.
+SYSTEM CONTEXT (VERY IMPORTANT)
+The following information is automatically extracted from the dataset and ML pipeline:
+{context}
+YOUR ROLE
+You must act as:
+- a senior data scientist
+- a decision-making assistant
+- a debugging expert
+YOUR TASK
+Based ONLY on the provided context:
+- Identify key problems in the dataset or pipeline
+- Explain WHY these problems matter
+- Suggest clear, practical improvements
+- If user asked a question, answer it using context
+- If no question is asked, provide a structured analysis
+RESPONSE STYLE
+- Be concise but insightful
+- Use bullet points when helpful
+- Avoid generic advice
+- Do NOT hallucinate missing data
+- Do NOT assume anything outside the context
+- Always tie your reasoning to the given dataset
+Remember:
+You are not ChatGPT.
+You are KaizenStat’s intelligence layer."""
+        # str.replace substitutes only the literal {context} token; no other braces are interpreted
+        return prompt_template.replace("{context}", json.dumps(context, indent=2))
+    @staticmethod
+    def _build_ai_prompt(context: dict, user_query: Optional[str] = None) -> str:
+        """
+        Build the single-message prompt: the system prompt, plus the question if one is given.
+        Args:
+            context: Dataset/pipeline summary passed to _get_system_prompt.
+            user_query: Optional question; None or an empty string yields the system prompt alone.
+        Returns:
+            The prompt text.
+        """
+        base_prompt = KaizenStat._get_system_prompt(context)
+        if not user_query:
+            return base_prompt
+        return f"{base_prompt}\n\nUSER QUESTION:\n{user_query}"
+    @staticmethod
+    def _call_openrouter_api_messages(messages: list, api_key: Optional[str] = None) -> str:
+        """
+        Send chat messages to OpenRouter, trying each fallback model in order.
+        Args:
+            messages: Chat messages (dicts with "role" and "content") sent as the request payload.
+            api_key: Optional OpenRouter API key. When omitted, the OPENROUTER_API_KEY
+                environment variable is used, then KaizenStat.DEFAULT_API_KEY.
+        Returns:
+            The content of the first choice from the first model that answers.
+        Raises:
+            ValueError: If no API key is available.
+            RuntimeError: If every model fails; the message carries the last error seen.
+        """
+        import ssl
+        # Prefer a caller- or environment-supplied key over the key bundled with the package
+        key = (
+            api_key
+            or os.environ.get("OPENROUTER_API_KEY")
+            or getattr(KaizenStat, "DEFAULT_API_KEY", "")
+        )
+        if not key:
+            raise ValueError("No OpenRouter API key found. Please provide one.")
+        url = "https://openrouter.ai/api/v1/chat/completions"
+        headers = {
+            "Authorization": f"Bearer {key}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "https://github.com/masuddarrahaman/KaizenStat-Library",
+            "X-Title": "KaizenStat Intelligence"
+        }
+        # Verify the server certificate: the request carries a bearer token, so an
+        # unverified context would expose it to anyone able to intercept the connection
+        ssl_context = ssl.create_default_context()
+        # Models list with fallback mechanisms
+        models = [
+            "google/gemini-2.5-flash",
+            "meta-llama/llama-3-8b-instruct:free",
+            "google/gemma-2-9b-it:free",
+            "qwen/qwen-2.5-72b-instruct:free",
+            "google/gemini-2.5-pro"
+        ]
+        last_error = None
+        for model in models:
+            payload = {
+                "model": model,
+                "messages": messages,
+                "temperature": 0.2,
+                "max_tokens": 1500
+            }
+            req = urllib.request.Request(
+                url,
+                data=json.dumps(payload).encode("utf-8"),
+                headers=headers,
+                method="POST"
+            )
+            try:
+                # 15 seconds timeout
+                with urllib.request.urlopen(req, context=ssl_context, timeout=15) as response:
+                    res = json.loads(response.read().decode("utf-8"))
+                choices = res.get("choices") if isinstance(res, dict) else None
+                if choices:
+                    return choices[0]["message"]["content"]
+                # A successful HTTP status without choices is still a failure: record it
+                # so the final RuntimeError does not report "Last error: None"
+                detail = ""
+                if isinstance(res, dict) and isinstance(res.get("error"), dict):
+                    detail = res["error"].get("message", "")
+                last_error = f"Unexpected Error: {detail or 'response contained no choices'}"
+                print(f"⚠️ Model {model} failed: {last_error}. Trying fallback model...")
+            except HTTPError as e:
+                # errors="replace" keeps a non-UTF-8 error body from raising inside this handler
+                err_body = e.read().decode("utf-8", errors="replace")
+                try:
+                    err_json = json.loads(err_body)
+                    error_msg = err_json.get("error", {}).get("message", "")
+                except Exception:
+                    error_msg = err_body
+                last_error = f"HTTP Error {e.code}: {error_msg}"
+                print(f"⚠️ Model {model} failed or server busy: {last_error}. Trying fallback model...")
+            except URLError as e:
+                last_error = f"Network Error: {e.reason}"
+                print(f"⚠️ Model {model} network error: {last_error}. Trying fallback model...")
+            except Exception as e:
+                last_error = f"Unexpected Error: {e}"
+                print(f"⚠️ Model {model} failed: {last_error}. Trying fallback model...")
+        raise RuntimeError(
+            f"Failed to query OpenRouter. Last error: {last_error}\n"
+            "Server might be busy or API token has expired. "
+            "Please check your internet connection or try again. "
+            "Alternatively, provide your own OpenRouter / Gemini API key via the `api_key` parameter."
+        )
+    @staticmethod
+    def analyze(data: Union[str, pd.DataFrame], target: str, api_key: Optional[str] = None) -> str:
+        """
+        Run the auto pipeline on a dataset and request an AI-written analysis of it.
+        Also prints the analysis and starts a new conversation history with it.
+        Args:
+            data: CSV path or DataFrame.
+            target: Name of the target column.
+            api_key: Optional custom OpenRouter API key.
+        Returns:
+            The analysis text returned by the AI engine.
+        """
+        frame = DataEngine.load(data)
+        # Run the auto pipeline first; the context is then read from KaizenStat._last_context
+        KaizenStat.auto(frame, target)
+        analysis_prompt = KaizenStat._build_ai_prompt(KaizenStat._last_context, user_query=None)
+        print("\n🧠 Querying KaizenStat Intelligence Engine...")
+        opening_turn = {"role": "user", "content": analysis_prompt}
+        answer = KaizenStat._call_openrouter_api_messages([opening_turn], api_key=api_key)
+        # The stored history keeps a short stand-in request rather than the full prompt
+        KaizenStat._conversation_history = [
+            {"role": "user", "content": "Analyze this dataset."},
+            {"role": "assistant", "content": answer},
+        ]
+        print("\n💬 KAIZENSTAT AUTOMATIC ANALYSIS:")
+        print(answer)
+        return answer
+    @staticmethod
+    def ask(user_query: str, api_key: Optional[str] = None) -> str:
+        """
+        Ask a one-off question about the most recently stored dataset context.
+        Prints the answer and replaces the conversation history with this exchange.
+        Args:
+            user_query: The question for the AI engine.
+            api_key: Optional custom OpenRouter API key.
+        Returns:
+            The AI response.
+        Raises:
+            ValueError: If no dataset context has been stored yet.
+        """
+        stored_context = KaizenStat._last_context
+        if stored_context is None:
+            raise ValueError(
+                "No dataset context found. Please run KaizenStat.analyze(df, target) "
+                "or KaizenStat.auto(df, target) first."
+            )
+        question_prompt = KaizenStat._build_ai_prompt(stored_context, user_query=user_query)
+        print(f"\n🧠 Querying KaizenStat Intelligence for: '{user_query}'...")
+        question_turn = {"role": "user", "content": question_prompt}
+        answer = KaizenStat._call_openrouter_api_messages([question_turn], api_key=api_key)
+        # Start a fresh thread: the history holds the bare question, not the full prompt
+        KaizenStat._conversation_history = [
+            {"role": "user", "content": user_query},
+            {"role": "assistant", "content": answer},
+        ]
+        print("\n💬 KAIZENSTAT RESPONSE:")
+        print(answer)
+        return answer
+    @staticmethod
+    def ask_followup(user_query: str, api_key: Optional[str] = None) -> str:
+        """
+        Ask a follow-up question keeping conversation history memory.
+        Falls back to ask() when there is no conversation to continue.
+        Args:
+            user_query: The follow-up question.
+            api_key: Optional custom OpenRouter API key.
+        Returns:
+            The AI response.
+        Raises:
+            ValueError: If no dataset context has been stored yet.
+        """
+        context = KaizenStat._last_context
+        if context is None:
+            raise ValueError(
+                "No dataset context found. Please run KaizenStat.analyze(df, target) "
+                "or KaizenStat.auto(df, target) first."
+            )
+        if not KaizenStat._conversation_history:
+            return KaizenStat.ask(user_query, api_key=api_key)
+        # Work on a copy: if the request fails, the shared history must not be left
+        # with an unanswered user turn that would corrupt every later follow-up
+        history = list(KaizenStat._conversation_history)
+        history.append({"role": "user", "content": user_query})
+        system_prompt = KaizenStat._get_system_prompt(context)
+        messages = [{"role": "system", "content": system_prompt}] + history
+        print(f"\n🧠 Querying KaizenStat (Follow-up) for: '{user_query}'...")
+        response = KaizenStat._call_openrouter_api_messages(messages, api_key=api_key)
+        history.append({"role": "assistant", "content": response})
+        # Commit the extended thread only after the call has succeeded
+        KaizenStat._conversation_history = history
+        print("\n💬 KAIZENSTAT RESPONSE:")
+        print(response)
+        return response

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/kaizenstat.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kaizenstat
-Version: 0.2.2
+Version: 0.2.4
 Summary: Zero-friction AutoML + Data Cleaning Toolkit
 Author: Masuddar Rahman
 Requires-Python: >=3.8
@@ -84,6 +84,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
 | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
 | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
 | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
+| - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
+| - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
+| - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
 ---
@@ -109,6 +112,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
 # 4. Generate standalone code for reproduction
 KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
+# 5. Dual-Mode Conversational AI (OpenRouter powered)
+# Runs automated structured AI analysis
+analysis = KaizenStat.analyze(df, target="target_column")
+# Ask custom developer queries about data or pipeline
+KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
+# Multi-turn conversation with memory context
+KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
 ```
 ### 2. Command Line Interface (CLI)

{kaizenstat-0.2.2 → kaizenstat-0.2.4}/setup.py RENAMED Viewed

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name="kaizenstat",
-    version="0.2.2",
+    version="0.2.4",
     author="Masuddar Rahman",
     description="Zero-friction AutoML + Data Cleaning Toolkit",
     long_description=open("README.md").read() if open("README.md") else "",