kaizenstat 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kaizenstat
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Zero-friction AutoML + Data Cleaning Toolkit
5
5
  Author: Masuddar Rahman
6
6
  Requires-Python: >=3.8
@@ -84,6 +84,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
84
84
  | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
85
85
  | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
86
86
  | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
87
+ | - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
88
+ | - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
89
+ | - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
87
90
 
88
91
  ---
89
92
 
@@ -109,6 +112,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
109
112
 
110
113
  # 4. Generate standalone code for reproduction
111
114
  KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
115
+
116
+ # 5. Dual-Mode Conversational AI (OpenRouter powered)
117
+ # Runs automated structured AI analysis
118
+ analysis = KaizenStat.analyze(df, target="target_column")
119
+
120
+ # Ask custom developer queries about data or pipeline
121
+ KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
122
+
123
+ # Multi-turn conversation with memory context
124
+ KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
112
125
  ```
113
126
 
114
127
  ### 2. Command Line Interface (CLI)
@@ -54,6 +54,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
54
54
  | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
55
55
  | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
56
56
  | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
57
+ | - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
58
+ | - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
59
+ | - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
57
60
 
58
61
  ---
59
62
 
@@ -79,6 +82,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
79
82
 
80
83
  # 4. Generate standalone code for reproduction
81
84
  KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
85
+
86
+ # 5. Dual-Mode Conversational AI (OpenRouter powered)
87
+ # Runs automated structured AI analysis
88
+ analysis = KaizenStat.analyze(df, target="target_column")
89
+
90
+ # Ask custom developer queries about data or pipeline
91
+ KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
92
+
93
+ # Multi-turn conversation with memory context
94
+ KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
82
95
  ```
83
96
 
84
97
  ### 2. Command Line Interface (CLI)
@@ -1,6 +1,6 @@
1
1
  from .core import KaizenStat, DataEngine, detect_device
2
2
 
3
- __version__ = "0.2.2"
3
+ __version__ = "0.2.4"
4
4
 
5
5
  __all__ = ["KaizenStat", "DataEngine", "detect_device", "__version__"]
6
6
 
@@ -5,6 +5,9 @@
5
5
  import os
6
6
  import time
7
7
  import warnings
8
+ import json
9
+ import urllib.request
10
+ from urllib.error import URLError, HTTPError
8
11
  from typing import Optional, Dict, List, Union
9
12
 
10
13
  import numpy as np
@@ -171,7 +174,13 @@ class KaizenStat:
171
174
  report(data, target, output_path) → Generate interactive HTML report
172
175
  save_model(pipeline, path) → Export trained model
173
176
  load_model(path) → Load exported model
177
+ analyze(df, target) → Intelligent dataset analysis
178
+ ask(query) → Conversational AI support
179
+ ask_followup(query) → Conversational AI follow-up support
174
180
  """
181
+ DEFAULT_API_KEY = "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"
182
+ _last_context = None
183
+ _conversation_history = []
175
184
 
176
185
  # ==========================
177
186
  # 🧠 VALIDATION
@@ -257,6 +266,7 @@ class KaizenStat:
257
266
  if imbalanced:
258
267
  print(f" ⚠️ Class Imbalance Detected (majority > 65%)")
259
268
 
269
+ KaizenStat._last_audit_findings = findings
260
270
  return findings
261
271
 
262
272
  # ==========================
@@ -362,6 +372,7 @@ class KaizenStat:
362
372
  if not dropped_cols and not actions:
363
373
  print(" ✓ Dataset was already clean")
364
374
 
375
+ KaizenStat._last_dropped_cols = dropped_cols
365
376
  return df
366
377
 
367
378
  # ==========================
@@ -519,6 +530,7 @@ class KaizenStat:
519
530
  KaizenStat._last_label_encoder = label_encoder
520
531
  KaizenStat._last_task_type = "classification" if is_classification else "regression"
521
532
  KaizenStat._last_target = target
533
+ KaizenStat._last_results_df = results_df
522
534
 
523
535
  return results_df
524
536
 
@@ -546,6 +558,9 @@ class KaizenStat:
546
558
  results = KaizenStat.benchmark(df, target)
547
559
 
548
560
  print(f"\n🏆 BEST MODEL: {results.iloc[0]['Model']} (Score: {results.iloc[0]['Score']:.4f})")
561
+
562
+ # Build and store context for conversational AI
563
+ KaizenStat._last_context = KaizenStat._build_context(df, target)
549
564
 
550
565
  return results
551
566
 
@@ -718,7 +733,7 @@ for col in list(df.columns):
718
733
  num_features = {num_features}
719
734
  cat_features = {cat_features}
720
735
 
721
- X = df[num_features + cat_features]
736
+ X = df[num_features + cat_features].copy()
722
737
  y = df["{target}"]
723
738
  {"" if not needs_label_encoder else """
724
739
  # Encode string labels
@@ -727,9 +742,9 @@ y = le.fit_transform(y)
727
742
  """}
728
743
  # Fill missing values
729
744
  if num_features:
730
- X[num_features] = X[num_features].fillna(X[num_features].median())
745
+ X.loc[:, num_features] = X[num_features].fillna(X[num_features].median())
731
746
  for col in cat_features:
732
- X[col] = X[col].fillna(X[col].mode().iloc[0] if not X[col].mode().empty else "Unknown")
747
+ X.loc[:, col] = X[col].fillna(X[col].mode().iloc[0] if not X[col].mode().empty else "Unknown")
733
748
 
734
749
  # 4. Preprocessing Pipeline
735
750
  preprocessor = ColumnTransformer([
@@ -1265,4 +1280,289 @@ with tab4:
1265
1280
  print(f" Open: http://localhost:{port}")
1266
1281
  print(f" Press Ctrl+C to stop\n")
1267
1282
 
1268
- os.system(f"streamlit run {app_file} --server.port {port} --server.headless true")
1283
+ os.system(f"streamlit run {app_file} --server.port {port} --server.headless true")
1284
+
1285
+ # ==========================
1286
+ # 🧠 AI CHAT & ANALYZE
1287
+ # ==========================
1288
@staticmethod
def _build_context(df: pd.DataFrame, target: str) -> dict:
    """Summarise the dataset and the latest pipeline results as a JSON-safe dict.

    Cached audit findings on ``KaizenStat`` are reused when present; otherwise
    ``KaizenStat.audit`` is run with its console output suppressed, and any
    exception it raises is treated as "no findings".

    Args:
        df: The dataset to summarise.
        target: Name of the target column (excluded from the cardinality scan).

    Returns:
        A dict with the keys ``shape``, ``missing``, ``dropped_cols``,
        ``model``, ``score``, ``imbalance`` and ``high_cardinality``, holding
        only built-in Python types.
    """
    findings = getattr(KaizenStat, "_last_audit_findings", {})
    if not findings:
        import contextlib
        import io

        # Run the audit quietly so its report does not interleave with AI output.
        with contextlib.redirect_stdout(io.StringIO()):
            try:
                findings = KaizenStat.audit(df, target)
            except Exception:
                findings = {}

    # Non-numeric feature columns with more than 20 distinct values.
    non_numeric = df.select_dtypes(exclude=[np.number]).columns
    wide_columns = [
        name for name in non_numeric
        if name != target and df[name].nunique() > 20
    ]

    # Dropped-column records may be plain names or tuples led by the name.
    recorded_drops = getattr(KaizenStat, "_last_dropped_cols", [])
    dropped_names = []
    if isinstance(recorded_drops, list):
        dropped_names = [
            str(entry[0]) if isinstance(entry, tuple) and len(entry) > 0 else str(entry)
            for entry in recorded_drops
        ]

    # Top row of the stored results table, if one is available.
    leaderboard = getattr(KaizenStat, "_last_results_df", None)
    if leaderboard is None or leaderboard.empty:
        top_model, top_score = "None", 0.0
    else:
        winner = leaderboard.iloc[0]
        top_model = winner["Model"]
        top_score = float(winner["Score"])

    is_imbalanced = findings.get("imbalanced", False)

    # Per-column missing-value counts, keeping only columns that have gaps.
    na_totals = df.isna().sum()
    columns_with_gaps = na_totals[na_totals > 0]

    # Cast everything to built-in types so json.dumps can serialise the result.
    return {
        "shape": [int(df.shape[0]), int(df.shape[1])],
        "missing": {str(col): int(count) for col, count in columns_with_gaps.items()},
        "dropped_cols": dropped_names,
        "model": str(top_model),
        "score": float(top_score),
        "imbalance": bool(is_imbalanced),
        "high_cardinality": [str(name) for name in wide_columns],
    }
1344
+
1345
@staticmethod
def _get_system_prompt(context: dict) -> str:
    """Return the system prompt with the dataset context embedded as JSON.

    Args:
        context: JSON-serialisable dataset/pipeline summary.

    Returns:
        The prompt template with its ``{context}`` placeholder replaced by
        ``json.dumps(context, indent=2)``.
    """
    prompt_template = """You are an expert Data Scientist AI assistant integrated inside a system called KaizenStat.

You are NOT a generic chatbot. You MUST ONLY answer based on the structured dataset context provided below.

SYSTEM CONTEXT (VERY IMPORTANT)
The following information is automatically extracted from the dataset and ML pipeline:
{context}

YOUR ROLE
You must act as:
- a senior data scientist
- a decision-making assistant
- a debugging expert

YOUR TASK
Based ONLY on the provided context:
- Identify key problems in the dataset or pipeline
- Explain WHY these problems matter
- Suggest clear, practical improvements
- If user asked a question, answer it using context
- If no question is asked, provide a structured analysis

RESPONSE STYLE
- Be concise but insightful
- Use bullet points when helpful
- Avoid generic advice
- Do NOT hallucinate missing data
- Do NOT assume anything outside the context
- Always tie your reasoning to the given dataset

Remember:
You are not ChatGPT.
You are KaizenStat’s intelligence layer."""
    # Plain text substitution: only the literal "{context}" marker is replaced.
    return prompt_template.replace("{context}", json.dumps(context, indent=2))
1381
+
1382
@staticmethod
def _build_ai_prompt(context: dict, user_query: Optional[str] = None) -> str:
    """Compose the single-message prompt sent for ``analyze`` and ``ask``.

    Args:
        context: Dataset/pipeline summary embedded in the system prompt.
        user_query: Optional question; when falsy, only the system prompt
            is returned.

    Returns:
        The system prompt, followed by a ``USER QUESTION:`` section when a
        question was supplied.
    """
    base_prompt = KaizenStat._get_system_prompt(context)
    if not user_query:
        return base_prompt
    return "{}\n\nUSER QUESTION:\n{}".format(base_prompt, user_query)
1388
+
1389
@staticmethod
def _call_openrouter_api_messages(messages: list, api_key: Optional[str] = None) -> str:
    """Send a chat-completion request to OpenRouter, falling back across models.

    Each model in the fallback list is tried in order with a 15-second
    timeout; the first response that contains a completion is returned.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        api_key: Optional OpenRouter API key. When omitted, the
            ``OPENROUTER_API_KEY`` environment variable is used, then the
            class-level ``DEFAULT_API_KEY`` if one is defined.

    Returns:
        The content of the first choice of the first successful response.

    Raises:
        ValueError: If no API key can be resolved.
        RuntimeError: If every model in the fallback list fails.
    """
    import os
    import ssl

    # NOTE(review): a key bundled with the package is visible to every
    # installer; prefer an explicit ``api_key`` or the environment variable.
    key = (
        api_key
        or os.environ.get("OPENROUTER_API_KEY")
        or getattr(KaizenStat, "DEFAULT_API_KEY", "")
    )
    if not key:
        raise ValueError("No OpenRouter API key found. Please provide one.")

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://github.com/masuddarrahaman/KaizenStat-Library",
        "X-Title": "KaizenStat Intelligence",
    }

    # The request carries a bearer token and dataset details, so the server
    # certificate must be verified; an unverified context would allow a
    # man-in-the-middle to read both.
    ssl_context = ssl.create_default_context()

    # Models tried in order until one returns a completion.
    models = [
        "google/gemini-2.5-flash",
        "meta-llama/llama-3-8b-instruct:free",
        "google/gemma-2-9b-it:free",
        "qwen/qwen-2.5-72b-instruct:free",
        "google/gemini-2.5-pro",
    ]

    last_error = None
    for model in models:
        payload = {
            "model": model,
            "messages": messages,
            "temperature": 0.2,
            "max_tokens": 1500,
        }
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers=headers,
            method="POST",
        )

        try:
            # 15 seconds timeout
            with urllib.request.urlopen(req, context=ssl_context, timeout=15) as response:
                res = json.loads(response.read().decode("utf-8"))

            choices = res.get("choices") if isinstance(res, dict) else None
            if choices:
                return choices[0]["message"]["content"]

            # A 200 response without choices is still a failure for this
            # model: record why, so the final error is never "None".
            api_error = res.get("error") if isinstance(res, dict) else None
            if isinstance(api_error, dict):
                api_error = api_error.get("message", "")
            last_error = f"No completion returned: {api_error or 'response contained no choices'}"
            print(f"⚠️ Model {model} failed: {last_error}. Trying fallback model...")
        except HTTPError as e:
            # Decode leniently so a malformed error body cannot raise from
            # inside this handler and abort the fallback loop.
            err_body = e.read().decode("utf-8", errors="replace")
            try:
                err_json = json.loads(err_body)
                error_msg = err_json.get("error", {}).get("message", "")
            except Exception:
                error_msg = err_body
            last_error = f"HTTP Error {e.code}: {error_msg}"
            print(f"⚠️ Model {model} failed or server busy: {last_error}. Trying fallback model...")
        except URLError as e:
            last_error = f"Network Error: {e.reason}"
            print(f"⚠️ Model {model} network error: {last_error}. Trying fallback model...")
        except Exception as e:
            last_error = f"Unexpected Error: {e}"
            print(f"⚠️ Model {model} failed: {last_error}. Trying fallback model...")

    raise RuntimeError(
        f"Failed to query OpenRouter. Last error: {last_error}\n"
        "Server might be busy or API token has expired. "
        "Please check your internet connection or try again. "
        "Alternatively, provide your own OpenRouter / Gemini API key via the `api_key` parameter."
    )
1458
+
1459
@staticmethod
def analyze(data: Union[str, pd.DataFrame], target: str, api_key: Optional[str] = None) -> str:
    """
    Run the auto pipeline on a dataset and request an AI analysis of it.

    The response also seeds the conversation history, so follow-up
    questions continue from this analysis.

    Args:
        data: CSV path or DataFrame.
        target: Name of the target column.
        api_key: Optional custom OpenRouter API key.

    Returns:
        The plain-English structured analysis.
    """
    frame = DataEngine.load(data)
    # auto() is run for its side effect: it stores the context read below.
    KaizenStat.auto(frame, target)

    analysis_prompt = KaizenStat._build_ai_prompt(KaizenStat._last_context, user_query=None)

    print("\n🧠 Querying KaizenStat Intelligence Engine...")
    opening_turn = {"role": "user", "content": analysis_prompt}
    reply = KaizenStat._call_openrouter_api_messages([opening_turn], api_key=api_key)

    # Start a fresh thread; the long prompt is stored as a short stand-in.
    KaizenStat._conversation_history = [
        {"role": "user", "content": "Analyze this dataset."},
        {"role": "assistant", "content": reply},
    ]

    print("\n💬 KAIZENSTAT AUTOMATIC ANALYSIS:")
    print(reply)
    return reply
1494
+
1495
@staticmethod
def ask(user_query: str, api_key: Optional[str] = None) -> str:
    """
    Answer a question using the most recently stored dataset context.

    Each call starts a new conversation thread made of this question and
    its answer.

    Args:
        user_query: The question for the AI engine.
        api_key: Optional custom OpenRouter API key.

    Returns:
        The AI response.

    Raises:
        ValueError: If no dataset context has been stored yet.
    """
    stored_context = KaizenStat._last_context
    if stored_context is None:
        raise ValueError(
            "No dataset context found. Please run KaizenStat.analyze(df, target) "
            "or KaizenStat.auto(df, target) first."
        )

    question_prompt = KaizenStat._build_ai_prompt(stored_context, user_query=user_query)

    print(f"\n🧠 Querying KaizenStat Intelligence for: '{user_query}'...")
    outgoing = [{"role": "user", "content": question_prompt}]
    answer = KaizenStat._call_openrouter_api_messages(outgoing, api_key=api_key)

    # Replace any earlier thread with this question/answer pair.
    KaizenStat._conversation_history = [
        {"role": "user", "content": user_query},
        {"role": "assistant", "content": answer},
    ]

    print("\n💬 KAIZENSTAT RESPONSE:")
    print(answer)
    return answer
1531
+
1532
@staticmethod
def ask_followup(user_query: str, api_key: Optional[str] = None) -> str:
    """
    Ask a follow-up question keeping conversation history memory.

    When no conversation exists yet, the call is delegated to ``ask``.
    The stored history is only updated after the request succeeds, so a
    failed call leaves the conversation exactly as it was.

    Args:
        user_query: The follow-up question.
        api_key: Optional custom OpenRouter API key.

    Returns:
        The AI response.

    Raises:
        ValueError: If no dataset context has been stored yet.
    """
    context = KaizenStat._last_context
    if context is None:
        raise ValueError(
            "No dataset context found. Please run KaizenStat.analyze(df, target) "
            "or KaizenStat.auto(df, target) first."
        )

    if not KaizenStat._conversation_history:
        return KaizenStat.ask(user_query, api_key=api_key)

    # Work on a copy: appending to the shared class-level list before the
    # request would leave a dangling user turn behind if the request raised.
    history = list(KaizenStat._conversation_history)
    history.append({"role": "user", "content": user_query})

    system_prompt = KaizenStat._get_system_prompt(context)
    messages = [{"role": "system", "content": system_prompt}] + history

    print(f"\n🧠 Querying KaizenStat (Follow-up) for: '{user_query}'...")
    response = KaizenStat._call_openrouter_api_messages(messages, api_key=api_key)

    # Commit both turns together now that the exchange is complete.
    history.append({"role": "assistant", "content": response})
    KaizenStat._conversation_history = history

    print("\n💬 KAIZENSTAT RESPONSE:")
    print(response)
    return response
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kaizenstat
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Zero-friction AutoML + Data Cleaning Toolkit
5
5
  Author: Masuddar Rahman
6
6
  Requires-Python: >=3.8
@@ -84,6 +84,9 @@ KaizenStat is designed around a single unified vocabulary. Every CLI command has
84
84
  | `kz export-model` | `KaizenStat.save_model()` | 💾 Trains the top pipeline and saves it directly to a `.joblib` binary. |
85
85
  | `kz report` | `KaizenStat.report()` | 📊 Generates a beautiful, interactive HTML profiling report with Chart.js. |
86
86
  | `kz serve` | `KaizenStat.serve()` | 🌐 Launches a local web dashboard to explore the data and run predictions. |
87
+ | - | `KaizenStat.analyze()` | 🧠 Executes auto-intelligence analysis over dataset context using LLM reasoning. |
88
+ | - | `KaizenStat.ask()` | 🤖 Answers complex developer queries about accuracy, data quality, or anomalies. |
89
+ | - | `KaizenStat.ask_followup()` | 🔁 Maintains multi-turn conversation memory with the data reasoning engine. |
87
90
 
88
91
  ---
89
92
 
@@ -109,6 +112,16 @@ leaderboard = KaizenStat.benchmark(clean_df, target="target_column")
109
112
 
110
113
  # 4. Generate standalone code for reproduction
111
114
  KaizenStat.codegen("dataset.csv", target="target_column", output_path="reproduce.py")
115
+
116
+ # 5. Dual-Mode Conversational AI (OpenRouter powered)
117
+ # Runs automated structured AI analysis
118
+ analysis = KaizenStat.analyze(df, target="target_column")
119
+
120
+ # Ask custom developer queries about data or pipeline
121
+ KaizenStat.ask("Why is model accuracy lower or what are the dataset flaws?")
122
+
123
+ # Multi-turn conversation with memory context
124
+ KaizenStat.ask_followup("What should I do to handle the missing values or high cardinality?")
112
125
  ```
113
126
 
114
127
  ### 2. Command Line Interface (CLI)
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="kaizenstat",
5
- version="0.2.2",
5
+ version="0.2.4",
6
6
  author="Masuddar Rahman",
7
7
  description="Zero-friction AutoML + Data Cleaning Toolkit",
8
8
  long_description=open("README.md").read() if open("README.md") else "",
File without changes
File without changes
File without changes