stats-compass-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. stats_compass_core-0.1.0/LICENSE +21 -0
  2. stats_compass_core-0.1.0/PKG-INFO +562 -0
  3. stats_compass_core-0.1.0/README.md +528 -0
  4. stats_compass_core-0.1.0/datasets/Bukayo_Saka_7322.csv +98 -0
  5. stats_compass_core-0.1.0/datasets/Housing.csv +546 -0
  6. stats_compass_core-0.1.0/datasets/TATASTEEL.csv +5307 -0
  7. stats_compass_core-0.1.0/pyproject.toml +83 -0
  8. stats_compass_core-0.1.0/stats_compass_core/__init__.py +52 -0
  9. stats_compass_core-0.1.0/stats_compass_core/cleaning/__init__.py +13 -0
  10. stats_compass_core-0.1.0/stats_compass_core/cleaning/apply_imputation.py +133 -0
  11. stats_compass_core-0.1.0/stats_compass_core/cleaning/dedupe.py +91 -0
  12. stats_compass_core-0.1.0/stats_compass_core/cleaning/dropna.py +98 -0
  13. stats_compass_core-0.1.0/stats_compass_core/cleaning/handle_outliers.py +314 -0
  14. stats_compass_core-0.1.0/stats_compass_core/data/__init__.py +32 -0
  15. stats_compass_core-0.1.0/stats_compass_core/data/add_column.py +126 -0
  16. stats_compass_core-0.1.0/stats_compass_core/data/concat_dataframes.py +142 -0
  17. stats_compass_core-0.1.0/stats_compass_core/data/datasets.py +94 -0
  18. stats_compass_core-0.1.0/stats_compass_core/data/drop_columns.py +102 -0
  19. stats_compass_core-0.1.0/stats_compass_core/data/get_sample.py +80 -0
  20. stats_compass_core-0.1.0/stats_compass_core/data/get_schema.py +109 -0
  21. stats_compass_core-0.1.0/stats_compass_core/data/list_dataframes.py +48 -0
  22. stats_compass_core-0.1.0/stats_compass_core/data/load_csv.py +104 -0
  23. stats_compass_core-0.1.0/stats_compass_core/data/merge_dataframes.py +179 -0
  24. stats_compass_core-0.1.0/stats_compass_core/data/rename_columns.py +100 -0
  25. stats_compass_core-0.1.0/stats_compass_core/eda/__init__.py +27 -0
  26. stats_compass_core-0.1.0/stats_compass_core/eda/chi_square_tests.py +271 -0
  27. stats_compass_core-0.1.0/stats_compass_core/eda/correlations.py +115 -0
  28. stats_compass_core-0.1.0/stats_compass_core/eda/data_quality.py +559 -0
  29. stats_compass_core-0.1.0/stats_compass_core/eda/describe.py +107 -0
  30. stats_compass_core-0.1.0/stats_compass_core/eda/hypothesis_tests.py +182 -0
  31. stats_compass_core-0.1.0/stats_compass_core/ml/__init__.py +56 -0
  32. stats_compass_core-0.1.0/stats_compass_core/ml/arima.py +1097 -0
  33. stats_compass_core-0.1.0/stats_compass_core/ml/common.py +143 -0
  34. stats_compass_core-0.1.0/stats_compass_core/ml/evaluate_classification_model.py +135 -0
  35. stats_compass_core-0.1.0/stats_compass_core/ml/evaluate_regression_model.py +86 -0
  36. stats_compass_core-0.1.0/stats_compass_core/ml/train_gradient_boosting_classifier.py +111 -0
  37. stats_compass_core-0.1.0/stats_compass_core/ml/train_gradient_boosting_regressor.py +111 -0
  38. stats_compass_core-0.1.0/stats_compass_core/ml/train_linear_regression.py +96 -0
  39. stats_compass_core-0.1.0/stats_compass_core/ml/train_logistic_regression.py +103 -0
  40. stats_compass_core-0.1.0/stats_compass_core/ml/train_random_forest_classifier.py +106 -0
  41. stats_compass_core-0.1.0/stats_compass_core/ml/train_random_forest_regressor.py +106 -0
  42. stats_compass_core-0.1.0/stats_compass_core/plots/__init__.py +21 -0
  43. stats_compass_core-0.1.0/stats_compass_core/plots/bar_chart.py +115 -0
  44. stats_compass_core-0.1.0/stats_compass_core/plots/classification_curves.py +351 -0
  45. stats_compass_core-0.1.0/stats_compass_core/plots/feature_importance.py +148 -0
  46. stats_compass_core-0.1.0/stats_compass_core/plots/histogram.py +122 -0
  47. stats_compass_core-0.1.0/stats_compass_core/plots/lineplot.py +133 -0
  48. stats_compass_core-0.1.0/stats_compass_core/plots/scatter_plot.py +122 -0
  49. stats_compass_core-0.1.0/stats_compass_core/registry.py +199 -0
  50. stats_compass_core-0.1.0/stats_compass_core/results.py +665 -0
  51. stats_compass_core-0.1.0/stats_compass_core/state.py +367 -0
  52. stats_compass_core-0.1.0/stats_compass_core/transforms/__init__.py +20 -0
  53. stats_compass_core-0.1.0/stats_compass_core/transforms/bin_rare_categories.py +263 -0
  54. stats_compass_core-0.1.0/stats_compass_core/transforms/filter_dataframe.py +92 -0
  55. stats_compass_core-0.1.0/stats_compass_core/transforms/groupby_aggregate.py +120 -0
  56. stats_compass_core-0.1.0/stats_compass_core/transforms/mean_target_encoding.py +301 -0
  57. stats_compass_core-0.1.0/stats_compass_core/transforms/pivot.py +125 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 machine_surfer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,562 @@
1
+ Metadata-Version: 2.3
2
+ Name: stats-compass-core
3
+ Version: 0.1.0
4
+ Summary: A clean toolkit of deterministic pandas-based data tools
5
+ License: MIT
6
+ Keywords: pandas,data,tools,data-science,eda,ml
7
+ Author: Olamide Ogunbiyi
8
+ Author-email: oogunbiyi21@users.noreply.github.com
9
+ Requires-Python: >=3.11,<4.0
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Provides-Extra: all
21
+ Provides-Extra: ml
22
+ Provides-Extra: plots
23
+ Provides-Extra: timeseries
24
+ Requires-Dist: matplotlib (>=3.6.0,<4.0.0) ; extra == "plots" or extra == "timeseries" or extra == "all"
25
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
26
+ Requires-Dist: pandas (>=2.0.0,<3.0.0)
27
+ Requires-Dist: pydantic (>=2.0.0,<3.0.0)
28
+ Requires-Dist: scikit-learn (>=1.3.0,<2.0.0) ; extra == "ml" or extra == "all"
29
+ Requires-Dist: scipy (>=1.13.0,<2.0.0)
30
+ Requires-Dist: seaborn (>=0.12.0,<0.13.0) ; extra == "plots" or extra == "all"
31
+ Requires-Dist: statsmodels (>=0.14.0,<0.15.0) ; extra == "timeseries" or extra == "all"
32
+ Description-Content-Type: text/markdown
33
+
34
+ # stats-compass-core
35
+
36
+ A stateful, MCP-compatible toolkit of pandas-based data tools for AI-powered data analysis.
37
+
38
+ ## Overview
39
+
40
+ **stats-compass-core** is a Python package that provides a curated collection of data tools designed for use with LLM agents via the Model Context Protocol (MCP). Unlike traditional pandas libraries, this package manages server-side state, allowing AI agents to work with DataFrames across multiple tool invocations without passing raw data over the wire.
41
+
42
+ ### Key Features
43
+
44
+ - 🔄 **Stateful Design**: Server-side `DataFrameState` manages multiple DataFrames and trained models
45
+ - 📦 **MCP-Compatible**: All tools return JSON-serializable Pydantic models
46
+ - 🧹 **Clean Architecture**: Organized into logical categories (data, cleaning, transforms, eda, ml, plots)
47
+ - 🔒 **Type-Safe**: Complete type hints with Pydantic schemas for input validation
48
+ - 🎯 **Memory-Managed**: Configurable memory limits prevent runaway state growth
49
+ - 📊 **Base64 Charts**: Visualization tools return PNG images as base64 strings
50
+ - 🤖 **Model Storage**: Trained ML models stored by ID for later use
51
+
52
+ ## Architecture
53
+
54
+ ```
55
+ ┌─────────────────────────────────────────────────────────────────┐
56
+ │ stats-compass-core │
57
+ │ ┌─────────────────────────────────────────────────────────┐ │
58
+ │ │ DataFrameState │ │
59
+ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
60
+ │ │ │ DataFrames │ │ Models │ │ History │ │ │
61
+ │ │ │ (by name) │ │ (by ID) │ │ (lineage) │ │ │
62
+ │ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
63
+ │ └─────────────────────────────────────────────────────────┘ │
64
+ │ │ │
65
+ │ ┌───────────────┼───────────────┐ │
66
+ │ ▼ ▼ ▼ │
67
+ │ ┌─────────────────┐ ┌─────────────┐ ┌─────────────────┐ │
68
+ │ │ Tool (state, │ │ Tool... │ │ Tool... │ │
69
+ │ │ params) │ │ │ │ │ │
70
+ │ └────────┬────────┘ └─────────────┘ └─────────────────┘ │
71
+ │ │ │
72
+ │ ▼ │
73
+ │ ┌─────────────────────────────────────────────────────────┐ │
74
+ │ │ Pydantic Result Model │ │
75
+ │ │ (JSON-serializable) │ │
76
+ │ └─────────────────────────────────────────────────────────┘ │
77
+ └─────────────────────────────────────────────────────────────────┘
78
+ ```
79
+
80
+ ### Three-Layer Stack
81
+
82
+ 1. **stats-compass-core** (this package) - Stateful Python tools
83
+ - Manages DataFrames and models server-side
84
+ - Returns JSON-serializable Pydantic results
85
+ - Pure data operations, no UI or orchestration
86
+
87
+ 2. **stats-compass-mcp** (separate package) - MCP Server
88
+ - Exposes tools via Model Context Protocol
89
+ - Handles JSON transport to/from LLM agents
90
+ - **Not part of this repository**
91
+
92
+ 3. **stats-compass-app** (separate package) - SaaS Application
93
+ - Web UI for human interaction
94
+ - Multi-tool pipelines and workflows
95
+ - **Not part of this repository**
96
+
97
+ ### Registry & Tool Discovery Flow
98
+
99
+ The `registry` module is the central nervous system for tool management. Here's how it works:
100
+
101
+ ```
102
+ ┌─────────────────────────────────────────────────────────────────────────┐
103
+ │ STARTUP / INITIALIZATION │
104
+ ├─────────────────────────────────────────────────────────────────────────┤
105
+ │ 1. App calls registry.auto_discover() │
106
+ │ 2. Registry walks category folders (data/, cleaning/, transforms/...) │
107
+ │ 3. Each module is imported via importlib.import_module() │
108
+ │ 4. @registry.register decorators fire, populating _tools dict │
109
+ └─────────────────────────────────────────────────────────────────────────┘
110
+
111
+
112
+ ┌─────────────────────────────────────────────────────────────────────────┐
113
+ │ TOOL INVOCATION │
114
+ ├─────────────────────────────────────────────────────────────────────────┤
115
+ │ 1. MCP server receives request: {"tool": "cleaning.drop_na", ...} │
116
+ │ 2. Calls registry.invoke("cleaning", "drop_na", state, params) │
117
+ │ 3. Registry validates params against Pydantic input_schema │
118
+ │ 4. Registry calls tool function with (state, validated_params) │
119
+ │ 5. Tool returns Pydantic result model (JSON-serializable) │
120
+ │ 6. MCP server sends result.model_dump_json() back to LLM │
121
+ └─────────────────────────────────────────────────────────────────────────┘
122
+ ```
123
+
124
+ **Key files:**
125
+ - `registry.py` - Tool registration and invocation
126
+ - `state.py` - DataFrameState for server-side data management
127
+ - `results.py` - Pydantic result types for JSON serialization
128
+
129
+ ## Installation
130
+
131
+ ### Basic Installation (Core Only)
132
+
133
+ ```bash
134
+ pip install stats-compass-core
135
+ ```
136
+
137
+ This installs the core functionality: data loading, cleaning, transforms, and EDA tools. Dependencies: pandas, numpy, scipy, pydantic.
138
+
139
+ ### With Optional Features
140
+
141
+ ```bash
142
+ # For machine learning tools (scikit-learn)
143
+ pip install stats-compass-core[ml]
144
+
145
+ # For plotting tools (matplotlib, seaborn)
146
+ pip install stats-compass-core[plots]
147
+
148
+ # For time series / ARIMA tools (statsmodels)
149
+ pip install stats-compass-core[timeseries]
150
+
151
+ # For everything
152
+ pip install stats-compass-core[all]
153
+ ```
154
+
155
+ ### For Development
156
+
157
+ ```bash
158
+ git clone https://github.com/oogunbiyi21/stats-compass-core.git
159
+ cd stats-compass-core
160
+ poetry install --with dev # Installs all deps including optional ones
161
+ ```
162
+
163
+ ## Quick Start
164
+
165
+ ### Basic Usage Pattern
166
+
167
+ All tools follow the same pattern:
168
+ 1. Create a `DataFrameState` instance (once per session)
169
+ 2. Load data into state
170
+ 3. Call tools with `(state, params)` signature
171
+ 4. Tools return JSON-serializable result objects
172
+
173
+ ```python
174
+ import pandas as pd
175
+ from stats_compass_core import DataFrameState, registry
176
+
177
+ # 1. Create state manager (one per session)
178
+ state = DataFrameState(memory_limit_mb=500)
179
+
180
+ # 2. Load data into state
181
+ df = pd.read_csv("sales_data.csv")
182
+ state.set_dataframe(df, name="sales", operation="load_csv")
183
+
184
+ # 3. Call tools via registry
185
+ result = registry.invoke("eda", "describe", state, {})
186
+ print(result.model_dump_json()) # JSON-serializable output
187
+
188
+ # 4. Chain operations
189
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
190
+ "by": ["region"],
191
+ "agg_func": {"revenue": "sum", "quantity": "mean"}
192
+ })
193
+ # Result DataFrame saved to state automatically
194
+ print(f"New DataFrame: {result.dataframe_name}")
195
+ ```
196
+
197
+ ### Direct Tool Usage
198
+
199
+ You can also import and call tools directly:
200
+
201
+ ```python
202
+ from stats_compass_core import DataFrameState
203
+ from stats_compass_core.eda.describe import describe, DescribeInput
204
+ from stats_compass_core.cleaning.dropna import drop_na, DropNAInput
205
+
206
+ # Create state and load data
207
+ state = DataFrameState()
208
+ state.set_dataframe(my_dataframe, name="data", operation="manual")
209
+
210
+ # Call tool with typed params
211
+ params = DescribeInput(percentiles=[0.25, 0.5, 0.75])
212
+ result = describe(state, params)
213
+
214
+ # Result is a Pydantic model
215
+ print(result.statistics) # dict of column stats
216
+ print(result.dataframe_name) # "data"
217
+ ```
218
+
219
+ ## Core Concepts
220
+
221
+ ### DataFrameState
222
+
223
+ The `DataFrameState` class manages all server-side data:
224
+
225
+ ```python
226
+ from stats_compass_core import DataFrameState
227
+
228
+ state = DataFrameState(memory_limit_mb=500)
229
+
230
+ # Store DataFrames (multiple allowed)
231
+ state.set_dataframe(df1, name="raw_data", operation="load_csv")
232
+ state.set_dataframe(df2, name="cleaned", operation="drop_na")
233
+
234
+ # Retrieve DataFrames
235
+ df = state.get_dataframe("raw_data")
236
+ df = state.get_dataframe() # Gets active DataFrame
237
+
238
+ # Check what's stored
239
+ print(state.list_dataframes()) # [DataFrameInfo(...), ...]
240
+ print(state.get_active_dataframe_name()) # 'cleaned' (most recent)
241
+
242
+ # Store trained models
243
+ model_id = state.store_model(
244
+ model=trained_model,
245
+ model_type="random_forest_classifier",
246
+ target_column="churn",
247
+ feature_columns=["age", "tenure", "balance"],
248
+ source_dataframe="training_data"
249
+ )
250
+
251
+ # Retrieve models
252
+ model = state.get_model(model_id)
253
+ info = state.get_model_info(model_id)
254
+ ```
255
+
256
+ ### Result Types
257
+
258
+ All tools return Pydantic models that serialize to JSON:
259
+
260
+ | Result Type | Used By | Key Fields |
261
+ |-------------|---------|------------|
262
+ | `DataFrameLoadResult` | data loading tools | `dataframe_name`, `shape`, `columns` |
263
+ | `DataFrameMutationResult` | cleaning tools | `rows_before`, `rows_after`, `rows_affected` |
264
+ | `DataFrameQueryResult` | transform tools | `data`, `shape`, `dataframe_name` |
265
+ | `DescribeResult` | describe | `statistics`, `columns_analyzed` |
266
+ | `CorrelationsResult` | correlations | `correlations`, `method` |
267
+ | `ChartResult` | all plot tools | `image_base64`, `chart_type` |
268
+ | `ModelTrainingResult` | ML training | `model_id`, `metrics`, `feature_columns` |
269
+ | `HypothesisTestResult` | statistical tests | `statistic`, `p_value`, `significant_at_05` |
270
+
271
+ ### Registry
272
+
273
+ The registry provides tool discovery and invocation:
274
+
275
+ ```python
276
+ from stats_compass_core import registry
277
+
278
+ # List all tools
279
+ for key, metadata in registry._tools.items():
280
+ print(f"{key}: {metadata.description}")
281
+
282
+ # Invoke a tool (handles param validation)
283
+ result = registry.invoke(
284
+ category="cleaning",
285
+ tool_name="drop_na",
286
+ state=state,
287
+ params={"how": "any", "axis": 0}
288
+ )
289
+ ```
290
+
291
+ ## Available Tools
292
+
293
+ ### Data Tools (`stats_compass_core.data`)
294
+
295
+ | Tool | Description | Returns |
296
+ |------|-------------|---------|
297
+ | `load_csv` | Load CSV file into state | `DataFrameLoadResult` |
298
+ | `get_schema` | Get DataFrame column types and stats | `SchemaResult` |
299
+ | `get_sample` | Get sample rows from DataFrame | `SampleResult` |
300
+ | `list_dataframes` | List all DataFrames in state | `DataFrameListResult` |
301
+
302
+ ### Cleaning Tools (`stats_compass_core.cleaning`)
303
+
304
+ | Tool | Description | Returns |
305
+ |------|-------------|---------|
306
+ | `drop_na` | Remove rows/columns with missing values | `DataFrameMutationResult` |
307
+ | `dedupe` | Remove duplicate rows | `DataFrameMutationResult` |
308
+ | `apply_imputation` | Fill missing values (mean/median/mode/constant) | `DataFrameMutationResult` |
309
+ | `handle_outliers` | Handle outliers (cap/remove/winsorize/log/IQR) | `OutlierHandlingResult` |
310
+
311
+ ### Transform Tools (`stats_compass_core.transforms`)
312
+
313
+ | Tool | Description | Returns |
314
+ |------|-------------|---------|
315
+ | `groupby_aggregate` | Group and aggregate data | `DataFrameQueryResult` |
316
+ | `pivot` | Reshape long to wide format | `DataFrameQueryResult` |
317
+ | `filter_dataframe` | Filter with pandas query syntax | `DataFrameQueryResult` |
318
+ | `bin_rare_categories` | Bin rare categories into 'Other' | `BinRareCategoriesResult` |
319
+ | `mean_target_encoding` | Target encoding for categoricals *[requires ml]* | `MeanTargetEncodingResult` |
320
+
321
+ ### EDA Tools (`stats_compass_core.eda`)
322
+
323
+ | Tool | Description | Returns |
324
+ |------|-------------|---------|
325
+ | `describe` | Descriptive statistics | `DescribeResult` |
326
+ | `correlations` | Correlation matrix | `CorrelationsResult` |
327
+ | `t_test` | Two-sample t-test | `HypothesisTestResult` |
328
+ | `z_test` | Two-sample z-test | `HypothesisTestResult` |
329
+ | `chi_square_independence` | Chi-square test for independence | `HypothesisTestResult` |
330
+ | `chi_square_goodness_of_fit` | Chi-square goodness-of-fit test | `HypothesisTestResult` |
331
+ | `analyze_missing_data` | Analyze missing data patterns | `MissingDataAnalysisResult` |
332
+ | `detect_outliers` | Detect outliers using IQR/Z-score | `OutlierDetectionResult` |
333
+ | `data_quality_report` | Comprehensive data quality report | `DataQualityReportResult` |
334
+
335
+ ### ML Tools (`stats_compass_core.ml`) *[requires ml extra]*
336
+
337
+ | Tool | Description | Returns |
338
+ |------|-------------|---------|
339
+ | `train_linear_regression` | Train linear regression | `ModelTrainingResult` |
340
+ | `train_logistic_regression` | Train logistic regression | `ModelTrainingResult` |
341
+ | `train_random_forest_classifier` | Train RF classifier | `ModelTrainingResult` |
342
+ | `train_random_forest_regressor` | Train RF regressor | `ModelTrainingResult` |
343
+ | `train_gradient_boosting_classifier` | Train GB classifier | `ModelTrainingResult` |
344
+ | `train_gradient_boosting_regressor` | Train GB regressor | `ModelTrainingResult` |
345
+ | `evaluate_classification_model` | Evaluate classifier | `ClassificationEvaluationResult` |
346
+ | `evaluate_regression_model` | Evaluate regressor | `RegressionEvaluationResult` |
347
+
348
+ ### Plotting Tools (`stats_compass_core.plots`) *[requires plots extra]*
349
+
350
+ | Tool | Description | Returns |
351
+ |------|-------------|---------|
352
+ | `histogram` | Histogram of numeric column | `ChartResult` |
353
+ | `lineplot` | Line plot of time series | `ChartResult` |
354
+ | `bar_chart` | Bar chart of category counts | `ChartResult` |
355
+ | `scatter_plot` | Scatter plot of two columns | `ChartResult` |
356
+ | `feature_importance` | Feature importance from model | `ChartResult` |
357
+ | `roc_curve_plot` | ROC curve for classification model | `ChartResult` |
358
+ | `precision_recall_curve_plot` | Precision-recall curve | `ChartResult` |
359
+
360
+ ### Time Series Tools (`stats_compass_core.ml`) *[requires timeseries extra]*
361
+
362
+ | Tool | Description | Returns |
363
+ |------|-------------|---------|
364
+ | `fit_arima` | Fit ARIMA(p,d,q) model | `ARIMAResult` |
365
+ | `forecast_arima` | Generate forecasts (supports natural language periods) | `ARIMAForecastResult` |
366
+ | `find_optimal_arima` | Grid search for best ARIMA parameters | `ARIMAParameterSearchResult` |
367
+ | `check_stationarity` | ADF/KPSS stationarity tests | `StationarityTestResult` |
368
+ | `infer_frequency` | Infer time series frequency | `InferFrequencyResult` |
369
+
370
+ ## Usage Examples
371
+
372
+ ### Complete Workflow Example
373
+
374
+ ```python
375
+ import pandas as pd
376
+ from stats_compass_core import DataFrameState, registry
377
+
378
+ # Initialize state
379
+ state = DataFrameState()
380
+
381
+ # Load data
382
+ df = pd.DataFrame({
383
+ "region": ["North", "South", "North", "South", "East"],
384
+ "product": ["A", "A", "B", "B", "A"],
385
+ "revenue": [100, 150, 200, None, 120],
386
+ "quantity": [10, 15, 20, 12, 11]
387
+ })
388
+ state.set_dataframe(df, name="sales", operation="manual_load")
389
+
390
+ # Step 1: Check schema
391
+ result = registry.invoke("data", "get_schema", state, {})
392
+ print(f"Columns: {[c['name'] for c in result.columns]}")
393
+
394
+ # Step 2: Handle missing values
395
+ result = registry.invoke("cleaning", "apply_imputation", state, {
396
+ "strategy": "mean",
397
+ "columns": ["revenue"]
398
+ })
399
+ print(f"Filled {result.rows_affected} values")
400
+
401
+ # Step 3: Aggregate by region
402
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
403
+ "by": ["region"],
404
+ "agg_func": {"revenue": "sum", "quantity": "mean"},
405
+ "save_as": "regional_summary"
406
+ })
407
+ print(f"Created: {result.dataframe_name}")
408
+
409
+ # Step 4: Describe the summary
410
+ result = registry.invoke("eda", "describe", state, {
411
+ "dataframe_name": "regional_summary"
412
+ })
413
+ print(result.model_dump_json(indent=2))
414
+
415
+ # Step 5: Create visualization
416
+ result = registry.invoke("plots", "bar_chart", state, {
417
+ "dataframe_name": "regional_summary",
418
+ "column": "region"
419
+ })
420
+ # result.image_base64 contains PNG image
421
+ ```
422
+
423
+ ### Working with Charts
424
+
425
+ ```python
426
+ import base64
427
+ from stats_compass_core import DataFrameState, registry
428
+
429
+ state = DataFrameState()
430
+ state.set_dataframe(my_df, name="data", operation="load")
431
+
432
+ # Create histogram
433
+ result = registry.invoke("plots", "histogram", state, {
434
+ "column": "price",
435
+ "bins": 20,
436
+ "title": "Price Distribution"
437
+ })
438
+
439
+ # Decode and save the image
440
+ image_bytes = base64.b64decode(result.image_base64)
441
+ with open("histogram.png", "wb") as f:
442
+ f.write(image_bytes)
443
+
444
+ # Or use in web response
445
+ # return Response(content=image_bytes, media_type="image/png")
446
+ ```
447
+
448
+ ### Training and Using Models
449
+
450
+ ```python
451
+ from stats_compass_core import DataFrameState, registry
452
+
453
+ state = DataFrameState()
454
+ state.set_dataframe(training_df, name="training", operation="load")
455
+
456
+ # Train model
457
+ result = registry.invoke("ml", "train_random_forest_classifier", state, {
458
+ "target_column": "churn",
459
+ "feature_columns": ["age", "tenure", "balance", "num_products"],
460
+ "test_size": 0.2
461
+ })
462
+
463
+ print(f"Model ID: {result.model_id}")
464
+ print(f"Accuracy: {result.metrics['accuracy']:.3f}")
465
+ print(f"Features: {result.feature_columns}")
466
+
467
+ # Model is stored in state for later use
468
+ model = state.get_model(result.model_id)
469
+
470
+ # Visualize feature importance
471
+ chart_result = registry.invoke("plots", "feature_importance", state, {
472
+ "model_id": result.model_id,
473
+ "top_n": 10
474
+ })
475
+ ```
476
+
477
+ ## Design Principles
478
+
479
+ ### 1. Stateful, Not Pure
480
+
481
+ Unlike traditional pandas libraries, tools mutate shared state:
482
+
483
+ ```python
484
+ # Tools operate on state, not raw DataFrames
485
+ result = drop_na(state, params) # ✓ Correct
486
+ result = drop_na(df, params) # ✗ Old pattern
487
+ ```
488
+
489
+ ### 2. JSON-Serializable Returns
490
+
491
+ All returns must be Pydantic models:
492
+
493
+ ```python
494
+ # Returns JSON-serializable result
495
+ result = describe(state, params)
496
+ json_str = result.model_dump_json() # Always works
497
+
498
+ # NOT raw DataFrames or matplotlib figures
499
+ ```
500
+
501
+ ### 3. Transform Tools Save to State
502
+
503
+ Transform operations create new named DataFrames:
504
+
505
+ ```python
506
+ result = registry.invoke("transforms", "groupby_aggregate", state, {
507
+ "by": ["region"],
508
+ "agg_func": {"sales": "sum"},
509
+ "save_as": "regional_totals" # Optional custom name
510
+ })
511
+ # New DataFrame now available as state.get_dataframe("regional_totals")
512
+ ```
513
+
514
+ ### 4. Models Stored by ID
515
+
516
+ Trained models aren't returned directly - they're stored:
517
+
518
+ ```python
519
+ result = train_random_forest_classifier(state, params)
520
+ # result.model_id = "random_forest_classifier_churn_20241207_143022"
521
+ # Use state.get_model(result.model_id) to retrieve
522
+ ```
523
+
524
+ ## Contributing
525
+
526
+ See [docs/CONTRIBUTING.md](docs/CONTRIBUTING.md) for detailed contribution guidelines.
527
+
528
+ ### Quick Start for Contributors
529
+
530
+ 1. Fork and clone the repository
531
+ 2. Install dependencies: `poetry install`
532
+ 3. Create a new tool following the pattern in existing tools
533
+ 4. Write tests in `tests/`
534
+ 5. Submit a pull request
535
+
536
+ ### Tool Signature Pattern
537
+
538
+ All tools must follow this signature:
539
+
540
+ ```python
541
+ from stats_compass_core.state import DataFrameState
542
+ from stats_compass_core.results import SomeResult
543
+ from stats_compass_core.registry import registry
544
+
545
+ class MyToolInput(BaseModel):
546
+ dataframe_name: str | None = Field(default=None)
547
+ # ... other params
548
+
549
+ @registry.register(category="category", input_schema=MyToolInput, description="...")
550
+ def my_tool(state: DataFrameState, params: MyToolInput) -> SomeResult:
551
+ df = state.get_dataframe(params.dataframe_name)
552
+ source_name = params.dataframe_name or state.get_active_dataframe_name()
553
+
554
+ # ... do work ...
555
+
556
+ return SomeResult(...)
557
+ ```
558
+
559
+ ## License
560
+
561
+ MIT License - see [LICENSE](LICENSE) for details.
562
+