stats-compass-core 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. stats_compass_core-0.1.2/LICENSE +21 -0
  2. stats_compass_core-0.1.2/PKG-INFO +566 -0
  3. stats_compass_core-0.1.2/README.md +532 -0
  4. stats_compass_core-0.1.2/pyproject.toml +83 -0
  5. stats_compass_core-0.1.2/stats_compass_core/__init__.py +52 -0
  6. stats_compass_core-0.1.2/stats_compass_core/cleaning/__init__.py +13 -0
  7. stats_compass_core-0.1.2/stats_compass_core/cleaning/apply_imputation.py +133 -0
  8. stats_compass_core-0.1.2/stats_compass_core/cleaning/dedupe.py +91 -0
  9. stats_compass_core-0.1.2/stats_compass_core/cleaning/dropna.py +98 -0
  10. stats_compass_core-0.1.2/stats_compass_core/cleaning/handle_outliers.py +314 -0
  11. stats_compass_core-0.1.2/stats_compass_core/data/__init__.py +32 -0
  12. stats_compass_core-0.1.2/stats_compass_core/data/add_column.py +126 -0
  13. stats_compass_core-0.1.2/stats_compass_core/data/concat_dataframes.py +142 -0
  14. stats_compass_core-0.1.2/stats_compass_core/data/datasets.py +94 -0
  15. stats_compass_core-0.1.2/stats_compass_core/data/drop_columns.py +102 -0
  16. stats_compass_core-0.1.2/stats_compass_core/data/get_sample.py +80 -0
  17. stats_compass_core-0.1.2/stats_compass_core/data/get_schema.py +109 -0
  18. stats_compass_core-0.1.2/stats_compass_core/data/list_dataframes.py +48 -0
  19. stats_compass_core-0.1.2/stats_compass_core/data/load_csv.py +104 -0
  20. stats_compass_core-0.1.2/stats_compass_core/data/merge_dataframes.py +179 -0
  21. stats_compass_core-0.1.2/stats_compass_core/data/rename_columns.py +100 -0
  22. stats_compass_core-0.1.2/stats_compass_core/datasets/Bukayo_Saka_7322.csv +98 -0
  23. stats_compass_core-0.1.2/stats_compass_core/datasets/Housing.csv +546 -0
  24. stats_compass_core-0.1.2/stats_compass_core/datasets/TATASTEEL.csv +5307 -0
  25. stats_compass_core-0.1.2/stats_compass_core/eda/__init__.py +27 -0
  26. stats_compass_core-0.1.2/stats_compass_core/eda/chi_square_tests.py +271 -0
  27. stats_compass_core-0.1.2/stats_compass_core/eda/correlations.py +115 -0
  28. stats_compass_core-0.1.2/stats_compass_core/eda/data_quality.py +559 -0
  29. stats_compass_core-0.1.2/stats_compass_core/eda/describe.py +107 -0
  30. stats_compass_core-0.1.2/stats_compass_core/eda/hypothesis_tests.py +182 -0
  31. stats_compass_core-0.1.2/stats_compass_core/ml/__init__.py +56 -0
  32. stats_compass_core-0.1.2/stats_compass_core/ml/arima.py +1097 -0
  33. stats_compass_core-0.1.2/stats_compass_core/ml/common.py +143 -0
  34. stats_compass_core-0.1.2/stats_compass_core/ml/evaluate_classification_model.py +135 -0
  35. stats_compass_core-0.1.2/stats_compass_core/ml/evaluate_regression_model.py +86 -0
  36. stats_compass_core-0.1.2/stats_compass_core/ml/train_gradient_boosting_classifier.py +111 -0
  37. stats_compass_core-0.1.2/stats_compass_core/ml/train_gradient_boosting_regressor.py +111 -0
  38. stats_compass_core-0.1.2/stats_compass_core/ml/train_linear_regression.py +96 -0
  39. stats_compass_core-0.1.2/stats_compass_core/ml/train_logistic_regression.py +103 -0
  40. stats_compass_core-0.1.2/stats_compass_core/ml/train_random_forest_classifier.py +106 -0
  41. stats_compass_core-0.1.2/stats_compass_core/ml/train_random_forest_regressor.py +106 -0
  42. stats_compass_core-0.1.2/stats_compass_core/plots/__init__.py +21 -0
  43. stats_compass_core-0.1.2/stats_compass_core/plots/bar_chart.py +115 -0
  44. stats_compass_core-0.1.2/stats_compass_core/plots/classification_curves.py +351 -0
  45. stats_compass_core-0.1.2/stats_compass_core/plots/feature_importance.py +148 -0
  46. stats_compass_core-0.1.2/stats_compass_core/plots/histogram.py +122 -0
  47. stats_compass_core-0.1.2/stats_compass_core/plots/lineplot.py +133 -0
  48. stats_compass_core-0.1.2/stats_compass_core/plots/scatter_plot.py +122 -0
  49. stats_compass_core-0.1.2/stats_compass_core/registry.py +199 -0
  50. stats_compass_core-0.1.2/stats_compass_core/results.py +665 -0
  51. stats_compass_core-0.1.2/stats_compass_core/state.py +367 -0
  52. stats_compass_core-0.1.2/stats_compass_core/transforms/__init__.py +20 -0
  53. stats_compass_core-0.1.2/stats_compass_core/transforms/bin_rare_categories.py +263 -0
  54. stats_compass_core-0.1.2/stats_compass_core/transforms/filter_dataframe.py +92 -0
  55. stats_compass_core-0.1.2/stats_compass_core/transforms/groupby_aggregate.py +120 -0
  56. stats_compass_core-0.1.2/stats_compass_core/transforms/mean_target_encoding.py +301 -0
  57. stats_compass_core-0.1.2/stats_compass_core/transforms/pivot.py +125 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 machine_surfer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,566 @@
1
+ Metadata-Version: 2.3
2
+ Name: stats-compass-core
3
+ Version: 0.1.2
4
+ Summary: A clean toolkit of deterministic pandas-based data tools
5
+ License: MIT
6
+ Keywords: pandas,data,tools,data-science,eda,ml
7
+ Author: Olatunji Ogunbiyi
8
+ Author-email: oogunbiyi21@users.noreply.github.com
9
+ Requires-Python: >=3.11,<4.0
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Provides-Extra: all
21
+ Provides-Extra: ml
22
+ Provides-Extra: plots
23
+ Provides-Extra: timeseries
24
+ Requires-Dist: matplotlib (>=3.6.0,<4.0.0) ; extra == "plots" or extra == "timeseries" or extra == "all"
25
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
26
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
27
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
28
+ Requires-Dist: scikit-learn (>=1.3.0,<2.0.0) ; extra == "ml" or extra == "all"
29
+ Requires-Dist: scipy (>=1.13.0,<2.0.0)
30
+ Requires-Dist: seaborn (>=0.12.0,<0.13.0) ; extra == "plots" or extra == "all"
31
+ Requires-Dist: statsmodels (>=0.14.0,<0.15.0) ; extra == "timeseries" or extra == "all"
32
+ Description-Content-Type: text/markdown
33
+
34
+ <div align="center">
35
+ <img src="./assets/logo/logo1.png" alt="Stats Compass Logo" width="200"/>
36
+
37
+ <h1>stats-compass-core</h1>
38
+
39
+ <p>A stateful, MCP-compatible toolkit of pandas-based data tools for AI-powered data analysis.</p>
40
+ </div>
41
+
42
+ ## Overview
43
+
44
+ **stats-compass-core** is a Python package that provides a curated collection of data tools designed for use with LLM agents via the Model Context Protocol (MCP). Unlike traditional pandas libraries, this package manages server-side state, allowing AI agents to work with DataFrames across multiple tool invocations without passing raw data over the wire.
45
+
46
+ ### Key Features
47
+
48
+ - 🔄 **Stateful Design**: Server-side `DataFrameState` manages multiple DataFrames and trained models
49
+ - 📦 **MCP-Compatible**: All tools return JSON-serializable Pydantic models
50
+ - 🧹 **Clean Architecture**: Organized into logical categories (data, cleaning, transforms, eda, ml, plots)
51
+ - 🔒 **Type-Safe**: Complete type hints with Pydantic schemas for input validation
52
+ - 🎯 **Memory-Managed**: Configurable memory limits prevent runaway state growth
53
+ - 📊 **Base64 Charts**: Visualization tools return PNG images as base64 strings
54
+ - 🤖 **Model Storage**: Trained ML models stored by ID for later use
55
+
56
+ ## Architecture
57
+
58
+ ```
59
+ ┌─────────────────────────────────────────────────────────────────┐
60
+ │ stats-compass-core │
61
+ │ ┌─────────────────────────────────────────────────────────┐ │
62
+ │ │ DataFrameState │ │
63
+ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
64
+ │ │ │ DataFrames │ │ Models │ │ History │ │ │
65
+ │ │ │ (by name) │ │ (by ID) │ │ (lineage) │ │ │
66
+ │ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
67
+ │ └─────────────────────────────────────────────────────────┘ │
68
+ │ │ │
69
+ │ ┌───────────────┼───────────────┐ │
70
+ │ ▼ ▼ ▼ │
71
+ │ ┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐ │
72
+ │ │ Tool (state, │ │ Tool... │ │ Tool... │ │
73
+ │ │ params) │ │ │ │ │ │
74
+ │ └────────┬────────┘ └─────────────┘ └─────────────────┘ │
75
+ │ │ │
76
+ │ ▼ │
77
+ │ ┌─────────────────────────────────────────────────────────┐ │
78
+ │ │ Pydantic Result Model │ │
79
+ │ │ (JSON-serializable) │ │
80
+ │ └─────────────────────────────────────────────────────────┘ │
81
+ └─────────────────────────────────────────────────────────────────┘
82
+ ```
83
+
84
+ ### Three-Layer Stack
85
+
86
+ 1. **stats-compass-core** (this package) - Stateful Python tools
87
+ - Manages DataFrames and models server-side
88
+ - Returns JSON-serializable Pydantic results
89
+ - Pure data operations, no UI or orchestration
90
+
91
+ 2. **stats-compass-mcp** (separate package) - MCP Server
92
+ - Exposes tools via Model Context Protocol
93
+ - Handles JSON transport to/from LLM agents
94
+ - **Not part of this repository**
95
+
96
+ 3. **stats-compass-app** (separate package) - SaaS Application
97
+ - Web UI for human interaction
98
+ - Multi-tool pipelines and workflows
99
+ - **Not part of this repository**
100
+
101
+ ### Registry & Tool Discovery Flow
102
+
103
+ The `registry` module is the central nervous system for tool management. Here's how it works:
104
+
105
+ ```
106
+ ┌─────────────────────────────────────────────────────────────────────────┐
107
+ │ STARTUP / INITIALIZATION │
108
+ ├─────────────────────────────────────────────────────────────────────────┤
109
+ │ 1. App calls registry.auto_discover() │
110
+ │ 2. Registry walks category folders (data/, cleaning/, transforms/...) │
111
+ │ 3. Each module is imported via importlib.import_module() │
112
+ │ 4. @registry.register decorators fire, populating _tools dict │
113
+ └─────────────────────────────────────────────────────────────────────────┘
114
+
115
+
116
+ ┌─────────────────────────────────────────────────────────────────────────┐
117
+ │ TOOL INVOCATION │
118
+ ├─────────────────────────────────────────────────────────────────────────┤
119
+ │ 1. MCP server receives request: {"tool": "cleaning.drop_na", ...} │
120
+ │ 2. Calls registry.invoke("cleaning", "drop_na", state, params) │
121
+ │ 3. Registry validates params against Pydantic input_schema │
122
+ │ 4. Registry calls tool function with (state, validated_params) │
123
+ │ 5. Tool returns Pydantic result model (JSON-serializable) │
124
+ │ 6. MCP server sends result.model_dump_json() back to LLM │
125
+ └─────────────────────────────────────────────────────────────────────────┘
126
+ ```
127
+
128
+ **Key files:**
129
+ - `registry.py` - Tool registration and invocation
130
+ - `state.py` - DataFrameState for server-side data management
131
+ - `results.py` - Pydantic result types for JSON serialization
132
+
133
+ ## Installation
134
+
135
+ ### Basic Installation (Core Only)
136
+
137
+ ```bash
138
+ pip install stats-compass-core
139
+ ```
140
+
141
+ This installs the core functionality: data loading, cleaning, transforms, and EDA tools. Dependencies: pandas, numpy, scipy, pydantic.
142
+
143
+ ### With Optional Features
144
+
145
+ ```bash
146
+ # For machine learning tools (scikit-learn)
147
+ pip install stats-compass-core[ml]
148
+
149
+ # For plotting tools (matplotlib, seaborn)
150
+ pip install stats-compass-core[plots]
151
+
152
+ # For time series / ARIMA tools (statsmodels)
153
+ pip install stats-compass-core[timeseries]
154
+
155
+ # For everything
156
+ pip install stats-compass-core[all]
157
+ ```
158
+
159
+ ### For Development
160
+
161
+ ```bash
162
+ git clone https://github.com/oogunbiyi21/stats-compass-core.git
163
+ cd stats-compass-core
164
+ poetry install --with dev # Installs all deps including optional ones
165
+ ```
166
+
167
+ ## Quick Start
168
+
169
+ ### Basic Usage Pattern
170
+
171
+ All tools follow the same pattern:
172
+ 1. Create a `DataFrameState` instance (once per session)
173
+ 2. Load data into state
174
+ 3. Call tools with `(state, params)` signature
175
+ 4. Tools return JSON-serializable result objects
176
+
177
+ ```python
178
+ import pandas as pd
179
+ from stats_compass_core import DataFrameState, registry
180
+
181
+ # 1. Create state manager (one per session)
182
+ state = DataFrameState(memory_limit_mb=500)
183
+
184
+ # 2. Load data into state
185
+ df = pd.read_csv("sales_data.csv")
186
+ state.set_dataframe(df, name="sales", operation="load_csv")
187
+
188
+ # 3. Call tools via registry
189
+ result = registry.invoke("eda", "describe", state, {})
190
+ print(result.model_dump_json()) # JSON-serializable output
191
+
192
+ # 4. Chain operations
193
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
194
+ "by": ["region"],
195
+ "agg_func": {"revenue": "sum", "quantity": "mean"}
196
+ })
197
+ # Result DataFrame saved to state automatically
198
+ print(f"New DataFrame: {result.dataframe_name}")
199
+ ```
200
+
201
+ ### Direct Tool Usage
202
+
203
+ You can also import and call tools directly:
204
+
205
+ ```python
206
+ from stats_compass_core import DataFrameState
207
+ from stats_compass_core.eda.describe import describe, DescribeInput
208
+ from stats_compass_core.cleaning.dropna import drop_na, DropNAInput
209
+
210
+ # Create state and load data
211
+ state = DataFrameState()
212
+ state.set_dataframe(my_dataframe, name="data", operation="manual")
213
+
214
+ # Call tool with typed params
215
+ params = DescribeInput(percentiles=[0.25, 0.5, 0.75])
216
+ result = describe(state, params)
217
+
218
+ # Result is a Pydantic model
219
+ print(result.statistics) # dict of column stats
220
+ print(result.dataframe_name) # "data"
221
+ ```
222
+
223
+ ## Core Concepts
224
+
225
+ ### DataFrameState
226
+
227
+ The `DataFrameState` class manages all server-side data:
228
+
229
+ ```python
230
+ from stats_compass_core import DataFrameState
231
+
232
+ state = DataFrameState(memory_limit_mb=500)
233
+
234
+ # Store DataFrames (multiple allowed)
235
+ state.set_dataframe(df1, name="raw_data", operation="load_csv")
236
+ state.set_dataframe(df2, name="cleaned", operation="drop_na")
237
+
238
+ # Retrieve DataFrames
239
+ df = state.get_dataframe("raw_data")
240
+ df = state.get_dataframe() # Gets active DataFrame
241
+
242
+ # Check what's stored
243
+ print(state.list_dataframes()) # [DataFrameInfo(...), ...]
244
+ print(state.get_active_dataframe_name()) # 'cleaned' (most recent)
245
+
246
+ # Store trained models
247
+ model_id = state.store_model(
248
+ model=trained_model,
249
+ model_type="random_forest_classifier",
250
+ target_column="churn",
251
+ feature_columns=["age", "tenure", "balance"],
252
+ source_dataframe="training_data"
253
+ )
254
+
255
+ # Retrieve models
256
+ model = state.get_model(model_id)
257
+ info = state.get_model_info(model_id)
258
+ ```
259
+
260
+ ### Result Types
261
+
262
+ All tools return Pydantic models that serialize to JSON:
263
+
264
+ | Result Type | Used By | Key Fields |
265
+ |-------------|---------|------------|
266
+ | `DataFrameLoadResult` | data loading tools | `dataframe_name`, `shape`, `columns` |
267
+ | `DataFrameMutationResult` | cleaning tools | `rows_before`, `rows_after`, `rows_affected` |
268
+ | `DataFrameQueryResult` | transform tools | `data`, `shape`, `dataframe_name` |
269
+ | `DescribeResult` | describe | `statistics`, `columns_analyzed` |
270
+ | `CorrelationsResult` | correlations | `correlations`, `method` |
271
+ | `ChartResult` | all plot tools | `image_base64`, `chart_type` |
272
+ | `ModelTrainingResult` | ML training | `model_id`, `metrics`, `feature_columns` |
273
+ | `HypothesisTestResult` | statistical tests | `statistic`, `p_value`, `significant_at_05` |
274
+
275
+ ### Registry
276
+
277
+ The registry provides tool discovery and invocation:
278
+
279
+ ```python
280
+ from stats_compass_core import registry
281
+
282
+ # List all tools
283
+ for key, metadata in registry._tools.items():
284
+ print(f"{key}: {metadata.description}")
285
+
286
+ # Invoke a tool (handles param validation)
287
+ result = registry.invoke(
288
+ category="cleaning",
289
+ tool_name="drop_na",
290
+ state=state,
291
+ params={"how": "any", "axis": 0}
292
+ )
293
+ ```
294
+
295
+ ## Available Tools
296
+
297
+ ### Data Tools (`stats_compass_core.data`)
298
+
299
+ | Tool | Description | Returns |
300
+ |------|-------------|---------|
301
+ | `load_csv` | Load CSV file into state | `DataFrameLoadResult` |
302
+ | `get_schema` | Get DataFrame column types and stats | `SchemaResult` |
303
+ | `get_sample` | Get sample rows from DataFrame | `SampleResult` |
304
+ | `list_dataframes` | List all DataFrames in state | `DataFrameListResult` |
305
+
306
+ ### Cleaning Tools (`stats_compass_core.cleaning`)
307
+
308
+ | Tool | Description | Returns |
309
+ |------|-------------|---------|
310
+ | `drop_na` | Remove rows/columns with missing values | `DataFrameMutationResult` |
311
+ | `dedupe` | Remove duplicate rows | `DataFrameMutationResult` |
312
+ | `apply_imputation` | Fill missing values (mean/median/mode/constant) | `DataFrameMutationResult` |
313
+ | `handle_outliers` | Handle outliers (cap/remove/winsorize/log/IQR) | `OutlierHandlingResult` |
314
+
315
+ ### Transform Tools (`stats_compass_core.transforms`)
316
+
317
+ | Tool | Description | Returns |
318
+ |------|-------------|---------|
319
+ | `groupby_aggregate` | Group and aggregate data | `DataFrameQueryResult` |
320
+ | `pivot` | Reshape long to wide format | `DataFrameQueryResult` |
321
+ | `filter_dataframe` | Filter with pandas query syntax | `DataFrameQueryResult` |
322
+ | `bin_rare_categories` | Bin rare categories into 'Other' | `BinRareCategoriesResult` |
323
+ | `mean_target_encoding` | Target encoding for categoricals *[requires ml]* | `MeanTargetEncodingResult` |
324
+
325
+ ### EDA Tools (`stats_compass_core.eda`)
326
+
327
+ | Tool | Description | Returns |
328
+ |------|-------------|---------|
329
+ | `describe` | Descriptive statistics | `DescribeResult` |
330
+ | `correlations` | Correlation matrix | `CorrelationsResult` |
331
+ | `t_test` | Two-sample t-test | `HypothesisTestResult` |
332
+ | `z_test` | Two-sample z-test | `HypothesisTestResult` |
333
+ | `chi_square_independence` | Chi-square test for independence | `HypothesisTestResult` |
334
+ | `chi_square_goodness_of_fit` | Chi-square goodness-of-fit test | `HypothesisTestResult` |
335
+ | `analyze_missing_data` | Analyze missing data patterns | `MissingDataAnalysisResult` |
336
+ | `detect_outliers` | Detect outliers using IQR/Z-score | `OutlierDetectionResult` |
337
+ | `data_quality_report` | Comprehensive data quality report | `DataQualityReportResult` |
338
+
339
+ ### ML Tools (`stats_compass_core.ml`) *[requires ml extra]*
340
+
341
+ | Tool | Description | Returns |
342
+ |------|-------------|---------|
343
+ | `train_linear_regression` | Train linear regression | `ModelTrainingResult` |
344
+ | `train_logistic_regression` | Train logistic regression | `ModelTrainingResult` |
345
+ | `train_random_forest_classifier` | Train RF classifier | `ModelTrainingResult` |
346
+ | `train_random_forest_regressor` | Train RF regressor | `ModelTrainingResult` |
347
+ | `train_gradient_boosting_classifier` | Train GB classifier | `ModelTrainingResult` |
348
+ | `train_gradient_boosting_regressor` | Train GB regressor | `ModelTrainingResult` |
349
+ | `evaluate_classification_model` | Evaluate classifier | `ClassificationEvaluationResult` |
350
+ | `evaluate_regression_model` | Evaluate regressor | `RegressionEvaluationResult` |
351
+
352
+ ### Plotting Tools (`stats_compass_core.plots`) *[requires plots extra]*
353
+
354
+ | Tool | Description | Returns |
355
+ |------|-------------|---------|
356
+ | `histogram` | Histogram of numeric column | `ChartResult` |
357
+ | `lineplot` | Line plot of time series | `ChartResult` |
358
+ | `bar_chart` | Bar chart of category counts | `ChartResult` |
359
+ | `scatter_plot` | Scatter plot of two columns | `ChartResult` |
360
+ | `feature_importance` | Feature importance from model | `ChartResult` |
361
+ | `roc_curve_plot` | ROC curve for classification model | `ChartResult` |
362
+ | `precision_recall_curve_plot` | Precision-recall curve | `ChartResult` |
363
+
364
+ ### Time Series Tools (`stats_compass_core.ml`) *[requires timeseries extra]*
365
+
366
+ | Tool | Description | Returns |
367
+ |------|-------------|---------|
368
+ | `fit_arima` | Fit ARIMA(p,d,q) model | `ARIMAResult` |
369
+ | `forecast_arima` | Generate forecasts (supports natural language periods) | `ARIMAForecastResult` |
370
+ | `find_optimal_arima` | Grid search for best ARIMA parameters | `ARIMAParameterSearchResult` |
371
+ | `check_stationarity` | ADF/KPSS stationarity tests | `StationarityTestResult` |
372
+ | `infer_frequency` | Infer time series frequency | `InferFrequencyResult` |
373
+
374
+ ## Usage Examples
375
+
376
+ ### Complete Workflow Example
377
+
378
+ ```python
379
+ import pandas as pd
380
+ from stats_compass_core import DataFrameState, registry
381
+
382
+ # Initialize state
383
+ state = DataFrameState()
384
+
385
+ # Load data
386
+ df = pd.DataFrame({
387
+ "region": ["North", "South", "North", "South", "East"],
388
+ "product": ["A", "A", "B", "B", "A"],
389
+ "revenue": [100, 150, 200, None, 120],
390
+ "quantity": [10, 15, 20, 12, 11]
391
+ })
392
+ state.set_dataframe(df, name="sales", operation="manual_load")
393
+
394
+ # Step 1: Check schema
395
+ result = registry.invoke("data", "get_schema", state, {})
396
+ print(f"Columns: {[c['name'] for c in result.columns]}")
397
+
398
+ # Step 2: Handle missing values
399
+ result = registry.invoke("cleaning", "apply_imputation", state, {
400
+ "strategy": "mean",
401
+ "columns": ["revenue"]
402
+ })
403
+ print(f"Filled {result.rows_affected} values")
404
+
405
+ # Step 3: Aggregate by region
406
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
407
+ "by": ["region"],
408
+ "agg_func": {"revenue": "sum", "quantity": "mean"},
409
+ "save_as": "regional_summary"
410
+ })
411
+ print(f"Created: {result.dataframe_name}")
412
+
413
+ # Step 4: Describe the summary
414
+ result = registry.invoke("eda", "describe", state, {
415
+ "dataframe_name": "regional_summary"
416
+ })
417
+ print(result.model_dump_json(indent=2))
418
+
419
+ # Step 5: Create visualization
420
+ result = registry.invoke("plots", "bar_chart", state, {
421
+ "dataframe_name": "regional_summary",
422
+ "column": "region"
423
+ })
424
+ # result.image_base64 contains PNG image
425
+ ```
426
+
427
+ ### Working with Charts
428
+
429
+ ```python
430
+ import base64
431
+ from stats_compass_core import DataFrameState, registry
432
+
433
+ state = DataFrameState()
434
+ state.set_dataframe(my_df, name="data", operation="load")
435
+
436
+ # Create histogram
437
+ result = registry.invoke("plots", "histogram", state, {
438
+ "column": "price",
439
+ "bins": 20,
440
+ "title": "Price Distribution"
441
+ })
442
+
443
+ # Decode and save the image
444
+ image_bytes = base64.b64decode(result.image_base64)
445
+ with open("histogram.png", "wb") as f:
446
+ f.write(image_bytes)
447
+
448
+ # Or use in web response
449
+ # return Response(content=image_bytes, media_type="image/png")
450
+ ```
451
+
452
+ ### Training and Using Models
453
+
454
+ ```python
455
+ from stats_compass_core import DataFrameState, registry
456
+
457
+ state = DataFrameState()
458
+ state.set_dataframe(training_df, name="training", operation="load")
459
+
460
+ # Train model
461
+ result = registry.invoke("ml", "train_random_forest_classifier", state, {
462
+ "target_column": "churn",
463
+ "feature_columns": ["age", "tenure", "balance", "num_products"],
464
+ "test_size": 0.2
465
+ })
466
+
467
+ print(f"Model ID: {result.model_id}")
468
+ print(f"Accuracy: {result.metrics['accuracy']:.3f}")
469
+ print(f"Features: {result.feature_columns}")
470
+
471
+ # Model is stored in state for later use
472
+ model = state.get_model(result.model_id)
473
+
474
+ # Visualize feature importance
475
+ chart_result = registry.invoke("plots", "feature_importance", state, {
476
+ "model_id": result.model_id,
477
+ "top_n": 10
478
+ })
479
+ ```
480
+
481
+ ## Design Principles
482
+
483
+ ### 1. Stateful, Not Pure
484
+
485
+ Unlike traditional pandas libraries, tools mutate shared state:
486
+
487
+ ```python
488
+ # Tools operate on state, not raw DataFrames
489
+ result = drop_na(state, params) # ✓ Correct
490
+ result = drop_na(df, params) # ✗ Old pattern
491
+ ```
492
+
493
+ ### 2. JSON-Serializable Returns
494
+
495
+ All returns must be Pydantic models:
496
+
497
+ ```python
498
+ # Returns JSON-serializable result
499
+ result = describe(state, params)
500
+ json_str = result.model_dump_json() # Always works
501
+
502
+ # NOT raw DataFrames or matplotlib figures
503
+ ```
504
+
505
+ ### 3. Transform Tools Save to State
506
+
507
+ Transform operations create new named DataFrames:
508
+
509
+ ```python
510
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
511
+ "by": ["region"],
512
+ "agg_func": {"sales": "sum"},
513
+ "save_as": "regional_totals" # Optional custom name
514
+ })
515
+ # New DataFrame now available as state.get_dataframe("regional_totals")
516
+ ```
517
+
518
+ ### 4. Models Stored by ID
519
+
520
+ Trained models aren't returned directly — they're stored:
521
+
522
+ ```python
523
+ result = train_random_forest_classifier(state, params)
524
+ # result.model_id = "random_forest_classifier_churn_20241207_143022"
525
+ # Use state.get_model(result.model_id) to retrieve
526
+ ```
527
+
528
+ ## Contributing
529
+
530
+ See [docs/CONTRIBUTING.md](docs/CONTRIBUTING.md) for detailed contribution guidelines.
531
+
532
+ ### Quick Start for Contributors
533
+
534
+ 1. Fork and clone the repository
535
+ 2. Install dependencies: `poetry install`
536
+ 3. Create a new tool following the pattern in existing tools
537
+ 4. Write tests in `tests/`
538
+ 5. Submit a pull request
539
+
540
+ ### Tool Signature Pattern
541
+
542
+ All tools must follow this signature:
543
+
544
+ ```python
545
+ from stats_compass_core.state import DataFrameState
546
+ from stats_compass_core.results import SomeResult
547
+ from stats_compass_core.registry import registry
548
+
549
+ class MyToolInput(BaseModel):
550
+ dataframe_name: str | None = Field(default=None)
551
+ # ... other params
552
+
553
+ @registry.register(category="category", input_schema=MyToolInput, description="...")
554
+ def my_tool(state: DataFrameState, params: MyToolInput) -> SomeResult:
555
+ df = state.get_dataframe(params.dataframe_name)
556
+ source_name = params.dataframe_name or state.get_active_dataframe_name()
557
+
558
+ # ... do work ...
559
+
560
+ return SomeResult(...)
561
+ ```
562
+
563
+ ## License
564
+
565
+ MIT License - see [LICENSE](LICENSE) for details.
566
+