stats-compass-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stats_compass_core-0.1.0/LICENSE +21 -0
- stats_compass_core-0.1.0/PKG-INFO +562 -0
- stats_compass_core-0.1.0/README.md +528 -0
- stats_compass_core-0.1.0/datasets/Bukayo_Saka_7322.csv +98 -0
- stats_compass_core-0.1.0/datasets/Housing.csv +546 -0
- stats_compass_core-0.1.0/datasets/TATASTEEL.csv +5307 -0
- stats_compass_core-0.1.0/pyproject.toml +83 -0
- stats_compass_core-0.1.0/stats_compass_core/__init__.py +52 -0
- stats_compass_core-0.1.0/stats_compass_core/cleaning/__init__.py +13 -0
- stats_compass_core-0.1.0/stats_compass_core/cleaning/apply_imputation.py +133 -0
- stats_compass_core-0.1.0/stats_compass_core/cleaning/dedupe.py +91 -0
- stats_compass_core-0.1.0/stats_compass_core/cleaning/dropna.py +98 -0
- stats_compass_core-0.1.0/stats_compass_core/cleaning/handle_outliers.py +314 -0
- stats_compass_core-0.1.0/stats_compass_core/data/__init__.py +32 -0
- stats_compass_core-0.1.0/stats_compass_core/data/add_column.py +126 -0
- stats_compass_core-0.1.0/stats_compass_core/data/concat_dataframes.py +142 -0
- stats_compass_core-0.1.0/stats_compass_core/data/datasets.py +94 -0
- stats_compass_core-0.1.0/stats_compass_core/data/drop_columns.py +102 -0
- stats_compass_core-0.1.0/stats_compass_core/data/get_sample.py +80 -0
- stats_compass_core-0.1.0/stats_compass_core/data/get_schema.py +109 -0
- stats_compass_core-0.1.0/stats_compass_core/data/list_dataframes.py +48 -0
- stats_compass_core-0.1.0/stats_compass_core/data/load_csv.py +104 -0
- stats_compass_core-0.1.0/stats_compass_core/data/merge_dataframes.py +179 -0
- stats_compass_core-0.1.0/stats_compass_core/data/rename_columns.py +100 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/__init__.py +27 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/chi_square_tests.py +271 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/correlations.py +115 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/data_quality.py +559 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/describe.py +107 -0
- stats_compass_core-0.1.0/stats_compass_core/eda/hypothesis_tests.py +182 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/__init__.py +56 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/arima.py +1097 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/common.py +143 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/evaluate_classification_model.py +135 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/evaluate_regression_model.py +86 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_gradient_boosting_classifier.py +111 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_gradient_boosting_regressor.py +111 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_linear_regression.py +96 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_logistic_regression.py +103 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_random_forest_classifier.py +106 -0
- stats_compass_core-0.1.0/stats_compass_core/ml/train_random_forest_regressor.py +106 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/__init__.py +21 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/bar_chart.py +115 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/classification_curves.py +351 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/feature_importance.py +148 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/histogram.py +122 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/lineplot.py +133 -0
- stats_compass_core-0.1.0/stats_compass_core/plots/scatter_plot.py +122 -0
- stats_compass_core-0.1.0/stats_compass_core/registry.py +199 -0
- stats_compass_core-0.1.0/stats_compass_core/results.py +665 -0
- stats_compass_core-0.1.0/stats_compass_core/state.py +367 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/__init__.py +20 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/bin_rare_categories.py +263 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/filter_dataframe.py +92 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/groupby_aggregate.py +120 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/mean_target_encoding.py +301 -0
- stats_compass_core-0.1.0/stats_compass_core/transforms/pivot.py +125 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 machine_surfer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: stats-compass-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A clean toolkit of deterministic pandas-based data tools
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: pandas,data,tools,data-science,eda,ml
|
|
7
|
+
Author: Olamide Ogunbiyi
|
|
8
|
+
Author-email: oogunbiyi21@users.noreply.github.com
|
|
9
|
+
Requires-Python: >=3.11,<4.0
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Provides-Extra: all
|
|
21
|
+
Provides-Extra: ml
|
|
22
|
+
Provides-Extra: plots
|
|
23
|
+
Provides-Extra: timeseries
|
|
24
|
+
Requires-Dist: matplotlib (>=3.6.0,<4.0.0) ; extra == "plots" or extra == "timeseries" or extra == "all"
|
|
25
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
26
|
+
Requires-Dist: pandas (>=2.0.0,<3.0.0)
|
|
27
|
+
Requires-Dist: pydantic (>=2.0.0,<3.0.0)
|
|
28
|
+
Requires-Dist: scikit-learn (>=1.3.0,<2.0.0) ; extra == "ml" or extra == "all"
|
|
29
|
+
Requires-Dist: scipy (>=1.13.0,<2.0.0)
|
|
30
|
+
Requires-Dist: seaborn (>=0.12.0,<0.13.0) ; extra == "plots" or extra == "all"
|
|
31
|
+
Requires-Dist: statsmodels (>=0.14.0,<0.15.0) ; extra == "timeseries" or extra == "all"
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# stats-compass-core
|
|
35
|
+
|
|
36
|
+
A stateful, MCP-compatible toolkit of pandas-based data tools for AI-powered data analysis.
|
|
37
|
+
|
|
38
|
+
## Overview
|
|
39
|
+
|
|
40
|
+
**stats-compass-core** is a Python package that provides a curated collection of data tools designed for use with LLM agents via the Model Context Protocol (MCP). Unlike traditional pandas libraries, this package manages server-side state, allowing AI agents to work with DataFrames across multiple tool invocations without passing raw data over the wire.
|
|
41
|
+
|
|
42
|
+
### Key Features
|
|
43
|
+
|
|
44
|
+
- 🔄 **Stateful Design**: Server-side `DataFrameState` manages multiple DataFrames and trained models
|
|
45
|
+
- 📦 **MCP-Compatible**: All tools return JSON-serializable Pydantic models
|
|
46
|
+
- 🧹 **Clean Architecture**: Organized into logical categories (data, cleaning, transforms, eda, ml, plots)
|
|
47
|
+
- 🔒 **Type-Safe**: Complete type hints with Pydantic schemas for input validation
|
|
48
|
+
- 🎯 **Memory-Managed**: Configurable memory limits prevent runaway state growth
|
|
49
|
+
- 📊 **Base64 Charts**: Visualization tools return PNG images as base64 strings
|
|
50
|
+
- 🤖 **Model Storage**: Trained ML models stored by ID for later use
|
|
51
|
+
|
|
52
|
+
## Architecture
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
56
|
+
│ stats-compass-core │
|
|
57
|
+
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
58
|
+
│ │ DataFrameState │ │
|
|
59
|
+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
|
60
|
+
│ │ │ DataFrames │ │ Models │ │ History │ │ │
|
|
61
|
+
│ │ │ (by name) │ │ (by ID) │ │ (lineage) │ │ │
|
|
62
|
+
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
|
63
|
+
│ └─────────────────────────────────────────────────────────┘ │
|
|
64
|
+
│ │ │
|
|
65
|
+
│ ┌───────────────┼───────────────┐ │
|
|
66
|
+
│ ▼ ▼ ▼ │
|
|
67
|
+
│ ┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐ │
|
|
68
|
+
│ │ Tool (state, │ │ Tool... │ │ Tool... │ │
|
|
69
|
+
│ │ params) │ │ │ │ │ │
|
|
70
|
+
│ └────────┬────────┘ └─────────────┘ └─────────────────┘ │
|
|
71
|
+
│ │ │
|
|
72
|
+
│ ▼ │
|
|
73
|
+
│ ┌─────────────────────────────────────────────────────────┐ │
|
|
74
|
+
│ │ Pydantic Result Model │ │
|
|
75
|
+
│ │ (JSON-serializable) │ │
|
|
76
|
+
│ └─────────────────────────────────────────────────────────┘ │
|
|
77
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Three-Layer Stack
|
|
81
|
+
|
|
82
|
+
1. **stats-compass-core** (this package) - Stateful Python tools
|
|
83
|
+
- Manages DataFrames and models server-side
|
|
84
|
+
- Returns JSON-serializable Pydantic results
|
|
85
|
+
- Pure data operations, no UI or orchestration
|
|
86
|
+
|
|
87
|
+
2. **stats-compass-mcp** (separate package) - MCP Server
|
|
88
|
+
- Exposes tools via Model Context Protocol
|
|
89
|
+
- Handles JSON transport to/from LLM agents
|
|
90
|
+
- **Not part of this repository**
|
|
91
|
+
|
|
92
|
+
3. **stats-compass-app** (separate package) - SaaS Application
|
|
93
|
+
- Web UI for human interaction
|
|
94
|
+
- Multi-tool pipelines and workflows
|
|
95
|
+
- **Not part of this repository**
|
|
96
|
+
|
|
97
|
+
### Registry & Tool Discovery Flow
|
|
98
|
+
|
|
99
|
+
The `registry` module is the central nervous system for tool management. Here's how it works:
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
┌─────────────────────────────────────────────────────────────────────────┐
|
|
103
|
+
│ STARTUP / INITIALIZATION │
|
|
104
|
+
├─────────────────────────────────────────────────────────────────────────┤
|
|
105
|
+
│ 1. App calls registry.auto_discover() │
|
|
106
|
+
│ 2. Registry walks category folders (data/, cleaning/, transforms/...) │
|
|
107
|
+
│ 3. Each module is imported via importlib.import_module() │
|
|
108
|
+
│ 4. @registry.register decorators fire, populating _tools dict │
|
|
109
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
110
|
+
│
|
|
111
|
+
▼
|
|
112
|
+
┌─────────────────────────────────────────────────────────────────────────┐
|
|
113
|
+
│ TOOL INVOCATION │
|
|
114
|
+
├─────────────────────────────────────────────────────────────────────────┤
|
|
115
|
+
│ 1. MCP server receives request: {"tool": "cleaning.drop_na", ...} │
|
|
116
|
+
│ 2. Calls registry.invoke("cleaning", "drop_na", state, params) │
|
|
117
|
+
│ 3. Registry validates params against Pydantic input_schema │
|
|
118
|
+
│ 4. Registry calls tool function with (state, validated_params) │
|
|
119
|
+
│ 5. Tool returns Pydantic result model (JSON-serializable) │
|
|
120
|
+
│ 6. MCP server sends result.model_dump_json() back to LLM │
|
|
121
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Key files:**
|
|
125
|
+
- `registry.py` - Tool registration and invocation
|
|
126
|
+
- `state.py` - DataFrameState for server-side data management
|
|
127
|
+
- `results.py` - Pydantic result types for JSON serialization
|
|
128
|
+
|
|
129
|
+
## Installation
|
|
130
|
+
|
|
131
|
+
### Basic Installation (Core Only)
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
pip install stats-compass-core
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
This installs the core functionality: data loading, cleaning, transforms, and EDA tools. Dependencies: pandas, numpy, scipy, pydantic.
|
|
138
|
+
|
|
139
|
+
### With Optional Features
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# For machine learning tools (scikit-learn)
|
|
143
|
+
pip install stats-compass-core[ml]
|
|
144
|
+
|
|
145
|
+
# For plotting tools (matplotlib, seaborn)
|
|
146
|
+
pip install stats-compass-core[plots]
|
|
147
|
+
|
|
148
|
+
# For time series / ARIMA tools (statsmodels)
|
|
149
|
+
pip install stats-compass-core[timeseries]
|
|
150
|
+
|
|
151
|
+
# For everything
|
|
152
|
+
pip install stats-compass-core[all]
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### For Development
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
git clone https://github.com/oogunbiyi21/stats-compass-core.git
|
|
159
|
+
cd stats-compass-core
|
|
160
|
+
poetry install --with dev # Installs all deps including optional ones
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Quick Start
|
|
164
|
+
|
|
165
|
+
### Basic Usage Pattern
|
|
166
|
+
|
|
167
|
+
All tools follow the same pattern:
|
|
168
|
+
1. Create a `DataFrameState` instance (once per session)
|
|
169
|
+
2. Load data into state
|
|
170
|
+
3. Call tools with `(state, params)` signature
|
|
171
|
+
4. Tools return JSON-serializable result objects
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
import pandas as pd
|
|
175
|
+
from stats_compass_core import DataFrameState, registry
|
|
176
|
+
|
|
177
|
+
# 1. Create state manager (one per session)
|
|
178
|
+
state = DataFrameState(memory_limit_mb=500)
|
|
179
|
+
|
|
180
|
+
# 2. Load data into state
|
|
181
|
+
df = pd.read_csv("sales_data.csv")
|
|
182
|
+
state.set_dataframe(df, name="sales", operation="load_csv")
|
|
183
|
+
|
|
184
|
+
# 3. Call tools via registry
|
|
185
|
+
result = registry.invoke("eda", "describe", state, {})
|
|
186
|
+
print(result.model_dump_json()) # JSON-serializable output
|
|
187
|
+
|
|
188
|
+
# 4. Chain operations
|
|
189
|
+
result = registry.invoke("transforms", "groupby_aggregate", state, {
|
|
190
|
+
"by": ["region"],
|
|
191
|
+
"agg_func": {"revenue": "sum", "quantity": "mean"}
|
|
192
|
+
})
|
|
193
|
+
# Result DataFrame saved to state automatically
|
|
194
|
+
print(f"New DataFrame: {result.dataframe_name}")
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Direct Tool Usage
|
|
198
|
+
|
|
199
|
+
You can also import and call tools directly:
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from stats_compass_core import DataFrameState
|
|
203
|
+
from stats_compass_core.eda.describe import describe, DescribeInput
|
|
204
|
+
from stats_compass_core.cleaning.dropna import drop_na, DropNAInput
|
|
205
|
+
|
|
206
|
+
# Create state and load data
|
|
207
|
+
state = DataFrameState()
|
|
208
|
+
state.set_dataframe(my_dataframe, name="data", operation="manual")
|
|
209
|
+
|
|
210
|
+
# Call tool with typed params
|
|
211
|
+
params = DescribeInput(percentiles=[0.25, 0.5, 0.75])
|
|
212
|
+
result = describe(state, params)
|
|
213
|
+
|
|
214
|
+
# Result is a Pydantic model
|
|
215
|
+
print(result.statistics) # dict of column stats
|
|
216
|
+
print(result.dataframe_name) # "data"
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Core Concepts
|
|
220
|
+
|
|
221
|
+
### DataFrameState
|
|
222
|
+
|
|
223
|
+
The `DataFrameState` class manages all server-side data:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from stats_compass_core import DataFrameState
|
|
227
|
+
|
|
228
|
+
state = DataFrameState(memory_limit_mb=500)
|
|
229
|
+
|
|
230
|
+
# Store DataFrames (multiple allowed)
|
|
231
|
+
state.set_dataframe(df1, name="raw_data", operation="load_csv")
|
|
232
|
+
state.set_dataframe(df2, name="cleaned", operation="drop_na")
|
|
233
|
+
|
|
234
|
+
# Retrieve DataFrames
|
|
235
|
+
df = state.get_dataframe("raw_data")
|
|
236
|
+
df = state.get_dataframe() # Gets active DataFrame
|
|
237
|
+
|
|
238
|
+
# Check what's stored
|
|
239
|
+
print(state.list_dataframes()) # [DataFrameInfo(...), ...]
|
|
240
|
+
print(state.get_active_dataframe_name()) # 'cleaned' (most recent)
|
|
241
|
+
|
|
242
|
+
# Store trained models
|
|
243
|
+
model_id = state.store_model(
|
|
244
|
+
model=trained_model,
|
|
245
|
+
model_type="random_forest_classifier",
|
|
246
|
+
target_column="churn",
|
|
247
|
+
feature_columns=["age", "tenure", "balance"],
|
|
248
|
+
source_dataframe="training_data"
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# Retrieve models
|
|
252
|
+
model = state.get_model(model_id)
|
|
253
|
+
info = state.get_model_info(model_id)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Result Types
|
|
257
|
+
|
|
258
|
+
All tools return Pydantic models that serialize to JSON:
|
|
259
|
+
|
|
260
|
+
| Result Type | Used By | Key Fields |
|
|
261
|
+
|-------------|---------|------------|
|
|
262
|
+
| `DataFrameLoadResult` | data loading tools | `dataframe_name`, `shape`, `columns` |
|
|
263
|
+
| `DataFrameMutationResult` | most cleaning tools | `rows_before`, `rows_after`, `rows_affected` |
|
|
264
|
+
| `DataFrameQueryResult` | most transform tools | `data`, `shape`, `dataframe_name` |
|
|
265
|
+
| `DescribeResult` | describe | `statistics`, `columns_analyzed` |
|
|
266
|
+
| `CorrelationsResult` | correlations | `correlations`, `method` |
|
|
267
|
+
| `ChartResult` | all plot tools | `image_base64`, `chart_type` |
|
|
268
|
+
| `ModelTrainingResult` | ML training | `model_id`, `metrics`, `feature_columns` |
|
|
269
|
+
| `HypothesisTestResult` | statistical tests | `statistic`, `p_value`, `significant_at_05` |
|
|
270
|
+
|
|
271
|
+
### Registry
|
|
272
|
+
|
|
273
|
+
The registry provides tool discovery and invocation:
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
from stats_compass_core import registry
|
|
277
|
+
|
|
278
|
+
# List all tools
|
|
279
|
+
for key, metadata in registry._tools.items():
|
|
280
|
+
    print(f"{key}: {metadata.description}")
|
|
281
|
+
|
|
282
|
+
# Invoke a tool (handles param validation)
|
|
283
|
+
result = registry.invoke(
|
|
284
|
+
category="cleaning",
|
|
285
|
+
tool_name="drop_na",
|
|
286
|
+
state=state,
|
|
287
|
+
params={"how": "any", "axis": 0}
|
|
288
|
+
)
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
## Available Tools
|
|
292
|
+
|
|
293
|
+
### Data Tools (`stats_compass_core.data`)
|
|
294
|
+
|
|
295
|
+
| Tool | Description | Returns |
|
|
296
|
+
|------|-------------|---------|
|
|
297
|
+
| `load_csv` | Load CSV file into state | `DataFrameLoadResult` |
|
|
298
|
+
| `get_schema` | Get DataFrame column types and stats | `SchemaResult` |
|
|
299
|
+
| `get_sample` | Get sample rows from DataFrame | `SampleResult` |
|
|
300
|
+
| `list_dataframes` | List all DataFrames in state | `DataFrameListResult` |
|
|
301
|
+
|
|
302
|
+
### Cleaning Tools (`stats_compass_core.cleaning`)
|
|
303
|
+
|
|
304
|
+
| Tool | Description | Returns |
|
|
305
|
+
|------|-------------|---------|
|
|
306
|
+
| `drop_na` | Remove rows/columns with missing values | `DataFrameMutationResult` |
|
|
307
|
+
| `dedupe` | Remove duplicate rows | `DataFrameMutationResult` |
|
|
308
|
+
| `apply_imputation` | Fill missing values (mean/median/mode/constant) | `DataFrameMutationResult` |
|
|
309
|
+
| `handle_outliers` | Handle outliers (cap/remove/winsorize/log/IQR) | `OutlierHandlingResult` |
|
|
310
|
+
|
|
311
|
+
### Transform Tools (`stats_compass_core.transforms`)
|
|
312
|
+
|
|
313
|
+
| Tool | Description | Returns |
|
|
314
|
+
|------|-------------|---------|
|
|
315
|
+
| `groupby_aggregate` | Group and aggregate data | `DataFrameQueryResult` |
|
|
316
|
+
| `pivot` | Reshape long to wide format | `DataFrameQueryResult` |
|
|
317
|
+
| `filter_dataframe` | Filter with pandas query syntax | `DataFrameQueryResult` |
|
|
318
|
+
| `bin_rare_categories` | Bin rare categories into 'Other' | `BinRareCategoriesResult` |
|
|
319
|
+
| `mean_target_encoding` | Target encoding for categoricals *[requires ml extra]* | `MeanTargetEncodingResult` |
|
|
320
|
+
|
|
321
|
+
### EDA Tools (`stats_compass_core.eda`)
|
|
322
|
+
|
|
323
|
+
| Tool | Description | Returns |
|
|
324
|
+
|------|-------------|---------|
|
|
325
|
+
| `describe` | Descriptive statistics | `DescribeResult` |
|
|
326
|
+
| `correlations` | Correlation matrix | `CorrelationsResult` |
|
|
327
|
+
| `t_test` | Two-sample t-test | `HypothesisTestResult` |
|
|
328
|
+
| `z_test` | Two-sample z-test | `HypothesisTestResult` |
|
|
329
|
+
| `chi_square_independence` | Chi-square test for independence | `HypothesisTestResult` |
|
|
330
|
+
| `chi_square_goodness_of_fit` | Chi-square goodness-of-fit test | `HypothesisTestResult` |
|
|
331
|
+
| `analyze_missing_data` | Analyze missing data patterns | `MissingDataAnalysisResult` |
|
|
332
|
+
| `detect_outliers` | Detect outliers using IQR/Z-score | `OutlierDetectionResult` |
|
|
333
|
+
| `data_quality_report` | Comprehensive data quality report | `DataQualityReportResult` |
|
|
334
|
+
|
|
335
|
+
### ML Tools (`stats_compass_core.ml`) *[requires ml extra]*
|
|
336
|
+
|
|
337
|
+
| Tool | Description | Returns |
|
|
338
|
+
|------|-------------|---------|
|
|
339
|
+
| `train_linear_regression` | Train linear regression | `ModelTrainingResult` |
|
|
340
|
+
| `train_logistic_regression` | Train logistic regression | `ModelTrainingResult` |
|
|
341
|
+
| `train_random_forest_classifier` | Train RF classifier | `ModelTrainingResult` |
|
|
342
|
+
| `train_random_forest_regressor` | Train RF regressor | `ModelTrainingResult` |
|
|
343
|
+
| `train_gradient_boosting_classifier` | Train GB classifier | `ModelTrainingResult` |
|
|
344
|
+
| `train_gradient_boosting_regressor` | Train GB regressor | `ModelTrainingResult` |
|
|
345
|
+
| `evaluate_classification_model` | Evaluate classifier | `ClassificationEvaluationResult` |
|
|
346
|
+
| `evaluate_regression_model` | Evaluate regressor | `RegressionEvaluationResult` |
|
|
347
|
+
|
|
348
|
+
### Plotting Tools (`stats_compass_core.plots`) *[requires plots extra]*
|
|
349
|
+
|
|
350
|
+
| Tool | Description | Returns |
|
|
351
|
+
|------|-------------|---------|
|
|
352
|
+
| `histogram` | Histogram of numeric column | `ChartResult` |
|
|
353
|
+
| `lineplot` | Line plot of time series | `ChartResult` |
|
|
354
|
+
| `bar_chart` | Bar chart of category counts | `ChartResult` |
|
|
355
|
+
| `scatter_plot` | Scatter plot of two columns | `ChartResult` |
|
|
356
|
+
| `feature_importance` | Feature importance from model | `ChartResult` |
|
|
357
|
+
| `roc_curve_plot` | ROC curve for classification model | `ChartResult` |
|
|
358
|
+
| `precision_recall_curve_plot` | Precision-recall curve | `ChartResult` |
|
|
359
|
+
|
|
360
|
+
### Time Series Tools (`stats_compass_core.ml`) *[requires timeseries extra]*
|
|
361
|
+
|
|
362
|
+
| Tool | Description | Returns |
|
|
363
|
+
|------|-------------|---------|
|
|
364
|
+
| `fit_arima` | Fit ARIMA(p,d,q) model | `ARIMAResult` |
|
|
365
|
+
| `forecast_arima` | Generate forecasts (supports natural language periods) | `ARIMAForecastResult` |
|
|
366
|
+
| `find_optimal_arima` | Grid search for best ARIMA parameters | `ARIMAParameterSearchResult` |
|
|
367
|
+
| `check_stationarity` | ADF/KPSS stationarity tests | `StationarityTestResult` |
|
|
368
|
+
| `infer_frequency` | Infer time series frequency | `InferFrequencyResult` |
|
|
369
|
+
|
|
370
|
+
## Usage Examples
|
|
371
|
+
|
|
372
|
+
### Complete Workflow Example
|
|
373
|
+
|
|
374
|
+
```python
|
|
375
|
+
import pandas as pd
|
|
376
|
+
from stats_compass_core import DataFrameState, registry
|
|
377
|
+
|
|
378
|
+
# Initialize state
|
|
379
|
+
state = DataFrameState()
|
|
380
|
+
|
|
381
|
+
# Load data
|
|
382
|
+
df = pd.DataFrame({
|
|
383
|
+
"region": ["North", "South", "North", "South", "East"],
|
|
384
|
+
"product": ["A", "A", "B", "B", "A"],
|
|
385
|
+
"revenue": [100, 150, 200, None, 120],
|
|
386
|
+
"quantity": [10, 15, 20, 12, 11]
|
|
387
|
+
})
|
|
388
|
+
state.set_dataframe(df, name="sales", operation="manual_load")
|
|
389
|
+
|
|
390
|
+
# Step 1: Check schema
|
|
391
|
+
result = registry.invoke("data", "get_schema", state, {})
|
|
392
|
+
print(f"Columns: {[c['name'] for c in result.columns]}")
|
|
393
|
+
|
|
394
|
+
# Step 2: Handle missing values
|
|
395
|
+
result = registry.invoke("cleaning", "apply_imputation", state, {
|
|
396
|
+
"strategy": "mean",
|
|
397
|
+
"columns": ["revenue"]
|
|
398
|
+
})
|
|
399
|
+
print(f"Filled {result.rows_affected} values")
|
|
400
|
+
|
|
401
|
+
# Step 3: Aggregate by region
|
|
402
|
+
result = registry.invoke("transforms", "groupby_aggregate", state, {
|
|
403
|
+
"by": ["region"],
|
|
404
|
+
"agg_func": {"revenue": "sum", "quantity": "mean"},
|
|
405
|
+
"save_as": "regional_summary"
|
|
406
|
+
})
|
|
407
|
+
print(f"Created: {result.dataframe_name}")
|
|
408
|
+
|
|
409
|
+
# Step 4: Describe the summary
|
|
410
|
+
result = registry.invoke("eda", "describe", state, {
|
|
411
|
+
"dataframe_name": "regional_summary"
|
|
412
|
+
})
|
|
413
|
+
print(result.model_dump_json(indent=2))
|
|
414
|
+
|
|
415
|
+
# Step 5: Create visualization
|
|
416
|
+
result = registry.invoke("plots", "bar_chart", state, {
|
|
417
|
+
"dataframe_name": "regional_summary",
|
|
418
|
+
"column": "region"
|
|
419
|
+
})
|
|
420
|
+
# result.image_base64 contains PNG image
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
### Working with Charts
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
import base64
|
|
427
|
+
from stats_compass_core import DataFrameState, registry
|
|
428
|
+
|
|
429
|
+
state = DataFrameState()
|
|
430
|
+
state.set_dataframe(my_df, name="data", operation="load")
|
|
431
|
+
|
|
432
|
+
# Create histogram
|
|
433
|
+
result = registry.invoke("plots", "histogram", state, {
|
|
434
|
+
"column": "price",
|
|
435
|
+
"bins": 20,
|
|
436
|
+
"title": "Price Distribution"
|
|
437
|
+
})
|
|
438
|
+
|
|
439
|
+
# Decode and save the image
|
|
440
|
+
image_bytes = base64.b64decode(result.image_base64)
|
|
441
|
+
with open("histogram.png", "wb") as f:
|
|
442
|
+
    f.write(image_bytes)
|
|
443
|
+
|
|
444
|
+
# Or use in web response
|
|
445
|
+
# return Response(content=image_bytes, media_type="image/png")
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Training and Using Models
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
from stats_compass_core import DataFrameState, registry
|
|
452
|
+
|
|
453
|
+
state = DataFrameState()
|
|
454
|
+
state.set_dataframe(training_df, name="training", operation="load")
|
|
455
|
+
|
|
456
|
+
# Train model
|
|
457
|
+
result = registry.invoke("ml", "train_random_forest_classifier", state, {
|
|
458
|
+
"target_column": "churn",
|
|
459
|
+
"feature_columns": ["age", "tenure", "balance", "num_products"],
|
|
460
|
+
"test_size": 0.2
|
|
461
|
+
})
|
|
462
|
+
|
|
463
|
+
print(f"Model ID: {result.model_id}")
|
|
464
|
+
print(f"Accuracy: {result.metrics['accuracy']:.3f}")
|
|
465
|
+
print(f"Features: {result.feature_columns}")
|
|
466
|
+
|
|
467
|
+
# Model is stored in state for later use
|
|
468
|
+
model = state.get_model(result.model_id)
|
|
469
|
+
|
|
470
|
+
# Visualize feature importance
|
|
471
|
+
chart_result = registry.invoke("plots", "feature_importance", state, {
|
|
472
|
+
"model_id": result.model_id,
|
|
473
|
+
"top_n": 10
|
|
474
|
+
})
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
## Design Principles
|
|
478
|
+
|
|
479
|
+
### 1. Stateful, Not Pure
|
|
480
|
+
|
|
481
|
+
Unlike traditional pandas libraries, tools mutate shared state:
|
|
482
|
+
|
|
483
|
+
```python
|
|
484
|
+
# Tools operate on state, not raw DataFrames
|
|
485
|
+
result = drop_na(state, params) # ✓ Correct
|
|
486
|
+
result = drop_na(df, params)  # ✗ Wrong: tools take a DataFrameState, not a raw DataFrame
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
### 2. JSON-Serializable Returns
|
|
490
|
+
|
|
491
|
+
All returns must be Pydantic models:
|
|
492
|
+
|
|
493
|
+
```python
|
|
494
|
+
# Returns JSON-serializable result
|
|
495
|
+
result = describe(state, params)
|
|
496
|
+
json_str = result.model_dump_json() # Always works
|
|
497
|
+
|
|
498
|
+
# NOT raw DataFrames or matplotlib figures
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### 3. Transform Tools Save to State
|
|
502
|
+
|
|
503
|
+
Transform operations create new named DataFrames:
|
|
504
|
+
|
|
505
|
+
```python
|
|
506
|
+
result = registry.invoke("transforms", "groupby_aggregate", state, {
|
|
507
|
+
"by": ["region"],
|
|
508
|
+
"agg_func": {"sales": "sum"},
|
|
509
|
+
"save_as": "regional_totals" # Optional custom name
|
|
510
|
+
})
|
|
511
|
+
# New DataFrame now available as state.get_dataframe("regional_totals")
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
### 4. Models Stored by ID
|
|
515
|
+
|
|
516
|
+
Trained models aren't returned directly - they're stored:
|
|
517
|
+
|
|
518
|
+
```python
|
|
519
|
+
result = train_random_forest_classifier(state, params)
|
|
520
|
+
# result.model_id = "random_forest_classifier_churn_20241207_143022"
|
|
521
|
+
# Use state.get_model(result.model_id) to retrieve
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
## Contributing
|
|
525
|
+
|
|
526
|
+
See [docs/CONTRIBUTING.md](docs/CONTRIBUTING.md) for detailed contribution guidelines.
|
|
527
|
+
|
|
528
|
+
### Quick Start for Contributors
|
|
529
|
+
|
|
530
|
+
1. Fork and clone the repository
|
|
531
|
+
2. Install dependencies: `poetry install`
|
|
532
|
+
3. Create a new tool following the pattern in existing tools
|
|
533
|
+
4. Write tests in `tests/`
|
|
534
|
+
5. Submit a pull request
|
|
535
|
+
|
|
536
|
+
### Tool Signature Pattern
|
|
537
|
+
|
|
538
|
+
All tools must follow this signature:
|
|
539
|
+
|
|
540
|
+
```python
|
|
541
|
+
from stats_compass_core.state import DataFrameState
|
|
542
|
+
from stats_compass_core.results import SomeResult
|
|
543
|
+
from stats_compass_core.registry import registry
|
|
544
|
+
|
|
545
|
+
class MyToolInput(BaseModel):
|
|
546
|
+
    dataframe_name: str | None = Field(default=None)
|
|
547
|
+
    # ... other params
|
|
548
|
+
|
|
549
|
+
@registry.register(category="category", input_schema=MyToolInput, description="...")
|
|
550
|
+
def my_tool(state: DataFrameState, params: MyToolInput) -> SomeResult:
|
|
551
|
+
    df = state.get_dataframe(params.dataframe_name)
|
|
552
|
+
    source_name = params.dataframe_name or state.get_active_dataframe_name()
|
|
553
|
+
|
|
554
|
+
    # ... do work ...
|
|
555
|
+
|
|
556
|
+
    return SomeResult(...)
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
## License
|
|
560
|
+
|
|
561
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
562
|
+
|