sol-ai-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sol-ai-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A professional Python library for automated data cleaning, model selection, and visualization.
|
|
5
|
+
Project-URL: Homepage, https://github.com/yourusername/sol-ai-core
|
|
6
|
+
Author-email: Your Name <your.email@example.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
10
|
+
Requires-Dist: openai>=1.0.0
|
|
11
|
+
Requires-Dist: pandas>=1.3.0
|
|
12
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# sol-ai-core
|
|
16
|
+
|
|
17
|
+
**SOL Engine** is now `sol-ai-core`, a professional, environment-agnostic Python library for automated data cleaning, machine learning model selection, and visualization.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
- **Logic Separated:** Works seamlessly in any Python environment (Jupyter, VS Code, CI/CD) without requiring Streamlit.
|
|
21
|
+
- **Class-Based Design:** Easy-to-use object-oriented approach.
|
|
22
|
+
- **End-to-End Automation:** Pass a DataFrame, auto-clean it, train models, and generate insights in just a few lines of code.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
You can install the package via pip once published to PyPI:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install sol-ai-core
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
For local development:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/yourusername/sol-ai-core.git
|
|
36
|
+
cd sol-ai-core
|
|
37
|
+
pip install -e .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import pandas as pd
|
|
44
|
+
from sol_core.engine import SolEngine
|
|
45
|
+
|
|
46
|
+
# 1. Load your data
|
|
47
|
+
df = pd.read_csv("data.csv")
|
|
48
|
+
|
|
49
|
+
# 2. Initialize the engine
|
|
50
|
+
engine = SolEngine(df)
|
|
51
|
+
|
|
52
|
+
# 3. Clean the data automatically
|
|
53
|
+
clean_df = engine.auto_clean()
|
|
54
|
+
|
|
55
|
+
# 4. Train a model automatically (Classification or Regression)
|
|
56
|
+
report = engine.select_and_train_model(target_column="target")
|
|
57
|
+
print(report)
|
|
58
|
+
|
|
59
|
+
# 5. Generate Visualizations (Returns Matplotlib figure objects)
|
|
60
|
+
figures = engine.generate_visualizations(target_column="target")
|
|
61
|
+
figures['target_distribution'].show()
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Publishing to PyPI
|
|
65
|
+
|
|
66
|
+
To build and publish this library to PyPI, use the following commands:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
# Install build tools and twine
|
|
70
|
+
pip install build twine
|
|
71
|
+
|
|
72
|
+
# Build the package (creates dist/ directory)
|
|
73
|
+
python -m build
|
|
74
|
+
|
|
75
|
+
# Upload to PyPI (will prompt for username and password/token)
|
|
76
|
+
python -m twine upload dist/*
|
|
77
|
+
```
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# sol-ai-core
|
|
2
|
+
|
|
3
|
+
**SOL Engine** is now `sol-ai-core`, a professional, environment-agnostic Python library for automated data cleaning, machine learning model selection, and visualization.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
- **Logic Separated:** Works seamlessly in any Python environment (Jupyter, VS Code, CI/CD) without requiring Streamlit.
|
|
7
|
+
- **Class-Based Design:** Easy-to-use object-oriented approach.
|
|
8
|
+
- **End-to-End Automation:** Pass a DataFrame, auto-clean it, train models, and generate insights in just a few lines of code.
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
You can install the package via pip once published to PyPI:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install sol-ai-core
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
For local development:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
git clone https://github.com/yourusername/sol-ai-core.git
|
|
22
|
+
cd sol-ai-core
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import pandas as pd
|
|
30
|
+
from sol_core.engine import SolEngine
|
|
31
|
+
|
|
32
|
+
# 1. Load your data
|
|
33
|
+
df = pd.read_csv("data.csv")
|
|
34
|
+
|
|
35
|
+
# 2. Initialize the engine
|
|
36
|
+
engine = SolEngine(df)
|
|
37
|
+
|
|
38
|
+
# 3. Clean the data automatically
|
|
39
|
+
clean_df = engine.auto_clean()
|
|
40
|
+
|
|
41
|
+
# 4. Train a model automatically (Classification or Regression)
|
|
42
|
+
report = engine.select_and_train_model(target_column="target")
|
|
43
|
+
print(report)
|
|
44
|
+
|
|
45
|
+
# 5. Generate Visualizations (Returns Matplotlib figure objects)
|
|
46
|
+
figures = engine.generate_visualizations(target_column="target")
|
|
47
|
+
figures['target_distribution'].show()
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Publishing to PyPI
|
|
51
|
+
|
|
52
|
+
To build and publish this library to PyPI, use the following commands:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Install build tools and twine
|
|
56
|
+
pip install build twine
|
|
57
|
+
|
|
58
|
+
# Build the package (creates dist/ directory)
|
|
59
|
+
python -m build
|
|
60
|
+
|
|
61
|
+
# Upload to PyPI (will prompt for username and password/token)
|
|
62
|
+
python -m twine upload dist/*
|
|
63
|
+
```
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sol-ai-core"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A professional Python library for automated data cleaning, model selection, and visualization."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Your Name", email = "your.email@example.com" }
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"pandas>=1.3.0",
|
|
17
|
+
"scikit-learn>=1.0.0",
|
|
18
|
+
"matplotlib>=3.4.0",
|
|
19
|
+
"openai>=1.0.0"
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/yourusername/sol-ai-core"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
packages = ["sol_core"]
|
|
27
|
+
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
from sklearn.model_selection import train_test_split
|
|
4
|
+
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
|
|
5
|
+
from sklearn.metrics import mean_squared_error, accuracy_score
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
import openai
|
|
8
|
+
|
|
9
|
+
class SolEngine:
    """End-to-end ML helper for a Pandas DataFrame: automated cleaning,
    baseline model training, and standard visualizations."""

    def __init__(self, df: pd.DataFrame):
        """
        Initializes the SolEngine with a Pandas DataFrame.

        The frame is copied so later cleaning never mutates the caller's data.
        """
        self.raw_df = df.copy()
        self.clean_df = None   # set by auto_clean()
        self.model = None      # set by select_and_train_model()
        self.report = None     # metrics dict from the last training run
        self.figures = {}      # figure name -> matplotlib Figure

    def auto_clean(self) -> pd.DataFrame:
        """
        Automatically cleans the DataFrame by:
        - Dropping columns with > 50% missing values
        - Imputing missing values for numeric and categorical columns
        - Encoding categorical variables

        Returns the cleaned DataFrame (also stored on ``self.clean_df``).
        """
        df = self.raw_df.copy()

        # Drop columns with > 50% missing values.
        threshold = len(df) * 0.5
        df = df.dropna(thresh=threshold, axis=1)

        # Impute missing values: median for numeric columns, mode otherwise.
        # is_numeric_dtype covers int32/float32 etc., which the previous
        # literal ['int64', 'float64'] check misrouted to the mode branch.
        for col in df.columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                mode = df[col].mode()
                if not mode.empty:  # mode() is empty for an all-NaN column
                    df[col] = df[col].fillna(mode[0])

        # Basic categorical encoding (one-hot) for columns with few unique values.
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        df = pd.get_dummies(df, columns=[c for c in cat_cols if df[c].nunique() < 10], drop_first=True)

        # Drop remaining object columns that weren't encoded.
        df = df.select_dtypes(exclude=['object'])

        self.clean_df = df
        return self.clean_df

    def select_and_train_model(self, target_column: str, task_type: str = 'auto') -> dict:
        """
        Selects and trains a basic model (Classification or Regression).

        Parameters:
            target_column: name of the target column in the cleaned data.
            task_type: 'classification', 'regression', or 'auto' to detect
                from the target's dtype and cardinality.

        Returns a dictionary report with metrics and feature importances.
        Raises ValueError if the target column is missing after cleaning.
        """
        if self.clean_df is None:
            self.auto_clean()

        if target_column not in self.clean_df.columns:
            raise ValueError(f"Target column '{target_column}' not found in cleaned data.")

        X = self.clean_df.drop(columns=[target_column])
        y = self.clean_df[target_column]

        # Auto-detect task type: low-cardinality integer/bool targets are
        # classification. The dtype-family checks generalize the previous
        # ['int64', 'bool', 'uint8'] literal list (e.g. int32 targets).
        if task_type == 'auto':
            if y.nunique() < 20 and (pd.api.types.is_integer_dtype(y) or pd.api.types.is_bool_dtype(y)):
                task_type = 'classification'
            else:
                task_type = 'regression'

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        if task_type == 'classification':
            self.model = RandomForestClassifier(random_state=42)
            self.model.fit(X_train, y_train)
            preds = self.model.predict(X_test)
            score = accuracy_score(y_test, preds)
            metric_name = "Accuracy"
        else:
            self.model = RandomForestRegressor(random_state=42)
            self.model.fit(X_train, y_train)
            preds = self.model.predict(X_test)
            score = mean_squared_error(y_test, preds) ** 0.5  # RMSE
            metric_name = "RMSE"

        self.report = {
            "task_type": task_type,
            "target_column": target_column,
            metric_name: score,
            "feature_importances": dict(zip(X.columns, self.model.feature_importances_))
        }

        return self.report

    def generate_visualizations(self, target_column: str) -> dict:
        """
        Generates standard visualizations and returns a dictionary of
        matplotlib figure objects (also stored on ``self.figures``).

        Raises ValueError if the target column is missing after cleaning,
        consistent with select_and_train_model().
        """
        if self.clean_df is None:
            self.auto_clean()

        if target_column not in self.clean_df.columns:
            raise ValueError(f"Target column '{target_column}' not found in cleaned data.")

        fig, ax = plt.subplots(figsize=(8, 5))
        self.clean_df[target_column].hist(bins=20, ax=ax)
        ax.set_title(f"Distribution of {target_column}")
        ax.set_xlabel(target_column)
        ax.set_ylabel("Frequency")

        self.figures['target_distribution'] = fig

        # Add feature importance visualization if a model has been trained.
        # Explicit None check: estimator truthiness is not a reliable signal.
        if self.model is not None and self.report:
            importances = pd.Series(self.report['feature_importances']).sort_values(ascending=False).head(10)
            fig2, ax2 = plt.subplots(figsize=(10, 6))
            importances.plot(kind='bar', ax=ax2)
            ax2.set_title("Top 10 Feature Importances")
            ax2.set_ylabel("Importance")
            plt.tight_layout()
            self.figures['feature_importance'] = fig2

        return self.figures

    def get_openai_insights(self, api_key: str, prompt: str) -> str:
        """
        Uses OpenAI to generate insights based on the model report.

        Raises ValueError if no model has been trained yet.
        """
        if not self.report:
            raise ValueError("Train a model first to generate insights based on the report.")

        # Legacy (<1.0) clients read the module-level key; harmless for >=1.0.
        openai.api_key = api_key

        system_prompt = "You are a data science expert analyzing machine learning model results."
        user_prompt = f"Here is the model report:\n{self.report}\n\nUser Question: {prompt}"
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Detect the installed client version up front rather than catching
        # AttributeError around the whole call: the previous try/except could
        # silently reroute an AttributeError raised while parsing the
        # response into the legacy code path.
        if hasattr(openai, "OpenAI"):
            # openai>=1.0.0 client API
            client = openai.OpenAI(api_key=api_key)
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=messages
            )
        else:
            # Fallback for older openai versions
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages
            )
        return response.choices[0].message.content
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Example: run the full SolEngine pipeline end-to-end on the Iris dataset."""
import pandas as pd
from sklearn.datasets import load_iris

from sol_core.engine import SolEngine

# 1. Load sample dataset
print("Loading sample dataset...")
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# 2. Initialize the engine
print("Initializing SolEngine...")
engine = SolEngine(df)

# 3. Clean the data automatically
print("Running auto_clean()...")
clean_df = engine.auto_clean()
print("Cleaned DataFrame shape:", clean_df.shape)

# 4. Train a model automatically (Classification)
print("\nRunning select_and_train_model()...")
report = engine.select_and_train_model(target_column="target")

print("\n--- Model Report ---")
for key, value in report.items():
    if key == "feature_importances":
        print(f"{key}:")
        # Iterate the dict view directly; wrapping it in list() made a
        # needless copy.
        for feat, imp in value.items():
            print(f" - {feat}: {imp:.4f}")
    else:
        print(f"{key}: {value}")
print("--------------------")

# 5. Generate Visualizations
print("\nGenerating visualizations...")
figures = engine.generate_visualizations(target_column="target")
print("Generated figure objects:", list(figures.keys()))

print("\nCode executed successfully!")
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
from sol_core.engine import SolEngine
|
|
5
|
+
|
|
6
|
+
@pytest.fixture
def sample_dataframe():
    """Fixture: a frame mixing numeric, categorical, mostly-missing, and
    binary target columns, for exercising the cleaning pipeline."""
    frame = pd.DataFrame({
        'num_col': [1, 2, np.nan, 4, 5, 6, 7, 8, 9, 10],
        'cat_col': ['A', 'B', 'A', np.nan, 'C', 'A', 'B', 'C', 'A', 'B'],
        'mostly_missing': [np.nan] * 6 + [1, 2, 3, 4],  # > 50% missing
        'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    })
    return frame
|
|
16
|
+
|
|
17
|
+
def test_engine_initialization(sample_dataframe):
    """A fresh engine holds a copy of the input and no derived state yet."""
    eng = SolEngine(sample_dataframe)
    assert eng.clean_df is None
    assert eng.model is None
    assert eng.raw_df.equals(sample_dataframe)
|
|
22
|
+
|
|
23
|
+
def test_auto_clean(sample_dataframe):
    """auto_clean drops sparse columns, imputes NaNs, and one-hot encodes."""
    cleaned = SolEngine(sample_dataframe).auto_clean()

    # The >50%-missing column must be gone.
    assert 'mostly_missing' not in cleaned.columns

    # No NaN may survive imputation.
    assert not cleaned.isnull().values.any()

    # The categorical column is replaced by one-hot dummy columns.
    assert 'cat_col' not in cleaned.columns
    assert any(name.startswith('cat_col_') for name in cleaned.columns)
|
|
36
|
+
|
|
37
|
+
def test_select_and_train_model_classification(sample_dataframe):
    """A binary integer target is auto-detected as classification."""
    eng = SolEngine(sample_dataframe)
    eng.auto_clean()

    result = eng.select_and_train_model(target_column='target')

    assert eng.model is not None
    assert result['task_type'] == 'classification'
    assert 'Accuracy' in result
    assert 'feature_importances' in result
|
|
48
|
+
|
|
49
|
+
def test_select_and_train_model_regression():
    """A continuous float target is auto-detected as regression."""
    from sklearn.datasets import make_regression

    features, target = make_regression(n_samples=100, n_features=3, random_state=42)
    frame = pd.DataFrame(features, columns=['f1', 'f2', 'f3'])
    frame['target'] = target

    eng = SolEngine(frame)
    result = eng.select_and_train_model(target_column='target')

    assert eng.model is not None
    assert result['task_type'] == 'regression'
    assert 'RMSE' in result
|
|
62
|
+
|
|
63
|
+
def test_generate_visualizations(sample_dataframe):
    """Both figures are produced once a model has been trained."""
    import matplotlib.pyplot as plt

    eng = SolEngine(sample_dataframe)
    # Training first means the feature-importance figure is produced too.
    eng.select_and_train_model(target_column='target')
    figs = eng.generate_visualizations(target_column='target')

    assert 'target_distribution' in figs
    assert 'feature_importance' in figs
    assert isinstance(figs['target_distribution'], plt.Figure)
|