sol-ai-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.4
2
+ Name: sol-ai-core
3
+ Version: 0.1.0
4
+ Summary: A professional Python library for automated data cleaning, model selection, and visualization.
5
+ Project-URL: Homepage, https://github.com/yourusername/sol-ai-core
6
+ Author-email: Your Name <your.email@example.com>
7
+ License: MIT
8
+ Requires-Python: >=3.8
9
+ Requires-Dist: matplotlib>=3.4.0
10
+ Requires-Dist: openai>=1.0.0
11
+ Requires-Dist: pandas>=1.3.0
12
+ Requires-Dist: scikit-learn>=1.0.0
13
+ Description-Content-Type: text/markdown
14
+
15
+ # sol-ai-core
16
+
17
+ **SOL Engine** is now `sol-ai-core`, a professional, environment-agnostic Python library for automated data cleaning, machine learning model selection, and visualization.
18
+
19
+ ## Features
20
+ - **Logic Separated:** Works seamlessly in any Python environment (Jupyter, VS Code, CI/CD) without requiring Streamlit.
21
+ - **Class-Based Design:** Easy-to-use object-oriented approach.
22
+ - **End-to-End Automation:** Pass a DataFrame, auto-clean it, train models, and generate insights in just a few lines of code.
23
+
24
+ ## Installation
25
+
26
+ You can install the package via pip once published to PyPI:
27
+
28
+ ```bash
29
+ pip install sol-ai-core
30
+ ```
31
+
32
+ For local development:
33
+
34
+ ```bash
35
+ git clone https://github.com/yourusername/sol-ai-core.git
36
+ cd sol-ai-core
37
+ pip install -e .
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ ```python
43
+ import pandas as pd
44
+ from sol_core.engine import SolEngine
45
+
46
+ # 1. Load your data
47
+ df = pd.read_csv("data.csv")
48
+
49
+ # 2. Initialize the engine
50
+ engine = SolEngine(df)
51
+
52
+ # 3. Clean the data automatically
53
+ clean_df = engine.auto_clean()
54
+
55
+ # 4. Train a model automatically (Classification or Regression)
56
+ report = engine.select_and_train_model(target_column="target")
57
+ print(report)
58
+
59
+ # 5. Generate Visualizations (Returns Matplotlib figure objects)
60
+ figures = engine.generate_visualizations(target_column="target")
61
+ figures['target_distribution'].show()
62
+ ```
63
+
64
+ ## Publishing to PyPI
65
+
66
+ To build and publish this library to PyPI, use the following commands:
67
+
68
+ ```bash
69
+ # Install build tools and twine
70
+ pip install build twine
71
+
72
+ # Build the package (creates dist/ directory)
73
+ python -m build
74
+
75
+ # Upload to PyPI (will prompt for username and password/token)
76
+ python -m twine upload dist/*
77
+ ```
@@ -0,0 +1,63 @@
1
+ # sol-ai-core
2
+
3
+ **SOL Engine** is now `sol-ai-core`, a professional, environment-agnostic Python library for automated data cleaning, machine learning model selection, and visualization.
4
+
5
+ ## Features
6
+ - **Logic Separated:** Works seamlessly in any Python environment (Jupyter, VS Code, CI/CD) without requiring Streamlit.
7
+ - **Class-Based Design:** Easy-to-use object-oriented approach.
8
+ - **End-to-End Automation:** Pass a DataFrame, auto-clean it, train models, and generate insights in just a few lines of code.
9
+
10
+ ## Installation
11
+
12
+ You can install the package via pip once published to PyPI:
13
+
14
+ ```bash
15
+ pip install sol-ai-core
16
+ ```
17
+
18
+ For local development:
19
+
20
+ ```bash
21
+ git clone https://github.com/yourusername/sol-ai-core.git
22
+ cd sol-ai-core
23
+ pip install -e .
24
+ ```
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ import pandas as pd
30
+ from sol_core.engine import SolEngine
31
+
32
+ # 1. Load your data
33
+ df = pd.read_csv("data.csv")
34
+
35
+ # 2. Initialize the engine
36
+ engine = SolEngine(df)
37
+
38
+ # 3. Clean the data automatically
39
+ clean_df = engine.auto_clean()
40
+
41
+ # 4. Train a model automatically (Classification or Regression)
42
+ report = engine.select_and_train_model(target_column="target")
43
+ print(report)
44
+
45
+ # 5. Generate Visualizations (Returns Matplotlib figure objects)
46
+ figures = engine.generate_visualizations(target_column="target")
47
+ figures['target_distribution'].show()
48
+ ```
49
+
50
+ ## Publishing to PyPI
51
+
52
+ To build and publish this library to PyPI, use the following commands:
53
+
54
+ ```bash
55
+ # Install build tools and twine
56
+ pip install build twine
57
+
58
+ # Build the package (creates dist/ directory)
59
+ python -m build
60
+
61
+ # Upload to PyPI (will prompt for username and password/token)
62
+ python -m twine upload dist/*
63
+ ```
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "sol-ai-core"
7
+ version = "0.1.0"
8
+ description = "A professional Python library for automated data cleaning, model selection, and visualization."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Your Name", email = "your.email@example.com" }
14
+ ]
15
+ dependencies = [
16
+ "pandas>=1.3.0",
17
+ "scikit-learn>=1.0.0",
18
+ "matplotlib>=3.4.0",
19
+ "openai>=1.0.0"
20
+ ]
21
+
22
+ [project.urls]
23
+ Homepage = "https://github.com/yourusername/sol-ai-core"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["sol_core"]
27
+
@@ -0,0 +1,3 @@
1
+ from .engine import SolEngine
2
+
3
+ __all__ = ["SolEngine"]
@@ -0,0 +1,154 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
5
+ from sklearn.metrics import mean_squared_error, accuracy_score
6
+ import matplotlib.pyplot as plt
7
+ import openai
8
+
9
class SolEngine:
    """Automated data cleaning, model selection, and visualization engine.

    Wraps a pandas DataFrame and provides an end-to-end pipeline:
    ``auto_clean`` -> ``select_and_train_model`` -> ``generate_visualizations``,
    plus optional LLM-based interpretation of the training report.
    """

    def __init__(self, df: pd.DataFrame):
        """Initialize the engine with a copy of *df* (the caller's frame is never mutated)."""
        self.raw_df = df.copy()
        self.clean_df = None   # populated by auto_clean()
        self.model = None      # populated by select_and_train_model()
        self.report = None     # metrics dict from the last training run
        self.figures = {}      # name -> matplotlib Figure, populated by generate_visualizations()

    def auto_clean(self) -> pd.DataFrame:
        """Clean the raw DataFrame, cache it in ``self.clean_df``, and return it.

        Steps:
        - Drop columns with more than 50% missing values.
        - Impute numeric columns with the median, all others with the mode.
        - One-hot encode low-cardinality (< 10 unique values) categorical columns.
        - Drop any remaining non-numeric (object) columns.
        """
        df = self.raw_df.copy()

        # Drop columns that have fewer than 50% non-null values.
        threshold = len(df) * 0.5
        df = df.dropna(thresh=threshold, axis=1)

        # Impute missing values. is_numeric_dtype also covers int32/float32
        # and other widths, not just the 64-bit dtypes the old check matched.
        for col in df.columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                mode = df[col].mode()
                # mode() can be empty (e.g. a column with no observed values);
                # indexing [0] would raise, so skip imputation in that case.
                if not mode.empty:
                    df[col] = df[col].fillna(mode[0])

        # One-hot encode low-cardinality categoricals (drop_first avoids the
        # dummy-variable trap / perfect collinearity).
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        encode = [c for c in cat_cols if df[c].nunique() < 10]
        df = pd.get_dummies(df, columns=encode, drop_first=True)

        # Drop high-cardinality object columns that were not encoded.
        df = df.select_dtypes(exclude=['object'])

        self.clean_df = df
        return self.clean_df

    def select_and_train_model(self, target_column: str, task_type: str = 'auto') -> dict:
        """Train a RandomForest model on the cleaned data and return a report dict.

        Args:
            target_column: Name of the target column in the cleaned data.
            task_type: 'classification', 'regression', or 'auto' to detect the
                task from the target's dtype and cardinality.

        Returns:
            dict with task_type, target_column, the score under "Accuracy" or
            "RMSE", and a feature -> importance mapping.

        Raises:
            ValueError: If *target_column* is absent from the cleaned data
                (e.g. it was dropped or one-hot encoded during cleaning).
        """
        if self.clean_df is None:
            self.auto_clean()

        if target_column not in self.clean_df.columns:
            raise ValueError(f"Target column '{target_column}' not found in cleaned data.")

        X = self.clean_df.drop(columns=[target_column])
        y = self.clean_df[target_column]

        # Auto-detect: a low-cardinality integer or boolean target is treated
        # as classification; everything else as regression. The dtype
        # predicates cover all integer widths, not only int64/uint8.
        if task_type == 'auto':
            discrete = pd.api.types.is_integer_dtype(y) or pd.api.types.is_bool_dtype(y)
            task_type = 'classification' if (discrete and y.nunique() < 20) else 'regression'

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        if task_type == 'classification':
            self.model = RandomForestClassifier(random_state=42)
            self.model.fit(X_train, y_train)
            preds = self.model.predict(X_test)
            score = accuracy_score(y_test, preds)
            metric_name = "Accuracy"
        else:
            self.model = RandomForestRegressor(random_state=42)
            self.model.fit(X_train, y_train)
            preds = self.model.predict(X_test)
            # RMSE = sqrt(MSE); computed manually for broad sklearn compatibility.
            score = mean_squared_error(y_test, preds) ** 0.5
            metric_name = "RMSE"

        self.report = {
            "task_type": task_type,
            "target_column": target_column,
            metric_name: score,
            "feature_importances": dict(zip(X.columns, self.model.feature_importances_)),
        }
        return self.report

    def generate_visualizations(self, target_column: str) -> dict:
        """Build standard plots and return them as a {name: matplotlib Figure} dict.

        Always produces 'target_distribution'; additionally produces
        'feature_importance' when a model has been trained.
        """
        if self.clean_df is None:
            self.auto_clean()

        fig, ax = plt.subplots(figsize=(8, 5))
        self.clean_df[target_column].hist(bins=20, ax=ax)
        ax.set_title(f"Distribution of {target_column}")
        ax.set_xlabel(target_column)
        ax.set_ylabel("Frequency")
        self.figures['target_distribution'] = fig

        # The feature-importance chart is only available after training.
        if self.model is not None and self.report:
            importances = (
                pd.Series(self.report['feature_importances'])
                .sort_values(ascending=False)
                .head(10)
            )
            fig2, ax2 = plt.subplots(figsize=(10, 6))
            importances.plot(kind='bar', ax=ax2)
            ax2.set_title("Top 10 Feature Importances")
            ax2.set_ylabel("Importance")
            # Target the figure explicitly instead of relying on pyplot's
            # "current figure" global state.
            fig2.tight_layout()
            self.figures['feature_importance'] = fig2

        return self.figures

    def get_openai_insights(self, api_key: str, prompt: str) -> str:
        """Ask OpenAI to interpret the most recent training report.

        Args:
            api_key: OpenAI API key.
            prompt: User question about the model report.

        Returns:
            The assistant's reply text.

        Raises:
            ValueError: If no model has been trained yet (no report to analyze).
        """
        if not self.report:
            raise ValueError("Train a model first to generate insights based on the report.")

        system_prompt = "You are a data science expert analyzing machine learning model results."
        user_prompt = f"Here is the model report:\n{self.report}\n\nUser Question: {prompt}"

        try:
            # openai>=1.0.0: per-client credentials. Avoids mutating the
            # module-level openai.api_key global on the modern path.
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content
        except AttributeError:
            # Fallback for openai<1.0.0, which only supports a module-level key.
            openai.api_key = api_key
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content
@@ -0,0 +1,39 @@
1
import pandas as pd
from sklearn.datasets import load_iris
from sol_core.engine import SolEngine


def main() -> None:
    """Run the end-to-end SolEngine demo on the Iris dataset."""
    # 1. Load sample dataset
    print("Loading sample dataset...")
    data = load_iris()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['target'] = data.target

    # 2. Initialize the engine
    print("Initializing SolEngine...")
    engine = SolEngine(df)

    # 3. Clean the data automatically
    print("Running auto_clean()...")
    clean_df = engine.auto_clean()
    print("Cleaned DataFrame shape:", clean_df.shape)

    # 4. Train a model automatically (classification for the Iris target)
    print("\nRunning select_and_train_model()...")
    report = engine.select_and_train_model(target_column="target")

    print("\n--- Model Report ---")
    for key, value in report.items():
        if key == "feature_importances":
            print(f"{key}:")
            # .items() is already iterable; no list() copy needed.
            for feat, imp in value.items():
                print(f"  - {feat}: {imp:.4f}")
        else:
            print(f"{key}: {value}")
    print("--------------------")

    # 5. Generate visualizations
    print("\nGenerating visualizations...")
    figures = engine.generate_visualizations(target_column="target")
    print("Generated figure objects:", list(figures.keys()))

    print("\nCode executed successfully!")


if __name__ == "__main__":
    # Guard so importing this module does not run the demo as a side effect.
    main()
@@ -0,0 +1,72 @@
1
+ import pytest
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sol_core.engine import SolEngine
5
+
6
@pytest.fixture
def sample_dataframe():
    """Build a small frame mixing numeric, categorical, and mostly-missing columns."""
    rows = {
        'num_col': [1, 2, np.nan, 4, 5, 6, 7, 8, 9, 10],
        'cat_col': ['A', 'B', 'A', np.nan, 'C', 'A', 'B', 'C', 'A', 'B'],
        'mostly_missing': [np.nan] * 6 + [1, 2, 3, 4],  # > 50% missing
        'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    }
    return pd.DataFrame(rows)


def test_engine_initialization(sample_dataframe):
    """A fresh engine holds a copy of the input and no derived state yet."""
    eng = SolEngine(sample_dataframe)
    assert eng.raw_df.equals(sample_dataframe)
    assert eng.clean_df is None
    assert eng.model is None


def test_auto_clean(sample_dataframe):
    """auto_clean drops sparse columns, imputes NaNs, and one-hot encodes categoricals."""
    cleaned = SolEngine(sample_dataframe).auto_clean()

    # The >50%-missing column is gone and nothing is NaN any more.
    assert 'mostly_missing' not in cleaned.columns
    assert not cleaned.isnull().values.any()

    # The categorical column has been replaced by dummy columns.
    assert 'cat_col' not in cleaned.columns
    dummy_cols = [c for c in cleaned.columns if c.startswith('cat_col_')]
    assert dummy_cols


def test_select_and_train_model_classification(sample_dataframe):
    """A low-cardinality integer target is trained as classification."""
    eng = SolEngine(sample_dataframe)
    eng.auto_clean()

    report = eng.select_and_train_model(target_column='target')

    assert report['task_type'] == 'classification'
    assert 'Accuracy' in report
    assert 'feature_importances' in report
    assert eng.model is not None


def test_select_and_train_model_regression():
    """A continuous target is trained as regression."""
    from sklearn.datasets import make_regression

    X, y = make_regression(n_samples=100, n_features=3, random_state=42)
    frame = pd.DataFrame(X, columns=['f1', 'f2', 'f3'])
    frame['target'] = y

    eng = SolEngine(frame)
    report = eng.select_and_train_model(target_column='target')

    assert report['task_type'] == 'regression'
    assert 'RMSE' in report
    assert eng.model is not None


def test_generate_visualizations(sample_dataframe):
    """Both the target histogram and the importance chart exist after training."""
    import matplotlib.pyplot as plt

    eng = SolEngine(sample_dataframe)
    # Training first also triggers auto_clean and enables the importance chart.
    eng.select_and_train_model(target_column='target')
    figures = eng.generate_visualizations(target_column='target')

    assert 'target_distribution' in figures
    assert 'feature_importance' in figures
    assert isinstance(figures['target_distribution'], plt.Figure)