mloda 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. mloda-0.2.15.dist-info/METADATA +391 -0
  2. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/RECORD +81 -76
  3. mloda_core/abstract_plugins/abstract_feature_group.py +57 -1
  4. mloda_core/abstract_plugins/components/feature.py +6 -6
  5. mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +1 -5
  6. mloda_core/abstract_plugins/components/feature_collection.py +39 -14
  7. mloda_core/abstract_plugins/components/feature_set.py +19 -17
  8. mloda_core/abstract_plugins/components/framework_transformer/cfw_transformer.py +42 -1
  9. mloda_core/abstract_plugins/components/input_data/base_input_data.py +1 -1
  10. mloda_core/abstract_plugins/components/link.py +24 -5
  11. mloda_core/abstract_plugins/components/options.py +135 -59
  12. mloda_core/abstract_plugins/compute_frame_work.py +47 -3
  13. mloda_core/api/request.py +6 -2
  14. mloda_core/core/step/join_step.py +1 -0
  15. mloda_core/core/step/transform_frame_work_step.py +27 -2
  16. mloda_core/filter/global_filter.py +5 -5
  17. mloda_core/prepare/execution_plan.py +2 -2
  18. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +1 -1
  19. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +20 -15
  20. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
  21. mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +8 -4
  22. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_merge_engine.py +4 -2
  23. mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +63 -48
  24. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +57 -12
  25. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +47 -37
  26. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +4 -0
  27. mloda_plugins/config/__init__.py +1 -0
  28. mloda_plugins/config/feature/__init__.py +0 -0
  29. mloda_plugins/config/feature/loader.py +164 -0
  30. mloda_plugins/config/feature/models.py +50 -0
  31. mloda_plugins/config/feature/parser.py +35 -0
  32. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +41 -14
  33. mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +79 -25
  34. mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +82 -28
  35. mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +87 -32
  36. mloda_plugins/feature_group/experimental/clustering/base.py +95 -15
  37. mloda_plugins/feature_group/experimental/clustering/pandas.py +231 -8
  38. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +107 -16
  39. mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +85 -44
  40. mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +101 -56
  41. mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +94 -38
  42. mloda_plugins/feature_group/experimental/default_options_key.py +1 -1
  43. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +81 -10
  44. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +89 -16
  45. mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +166 -1
  46. mloda_plugins/feature_group/experimental/forecasting/base.py +127 -24
  47. mloda_plugins/feature_group/experimental/forecasting/pandas.py +251 -23
  48. mloda_plugins/feature_group/experimental/geo_distance/base.py +4 -4
  49. mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +2 -2
  50. mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +90 -0
  51. mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +165 -0
  52. mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +156 -0
  53. mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +191 -0
  54. mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +159 -2
  55. mloda_plugins/feature_group/experimental/node_centrality/base.py +6 -6
  56. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +114 -32
  57. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +3 -3
  58. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +2 -2
  59. mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +3 -3
  60. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +3 -3
  61. mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +3 -3
  62. mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +4 -4
  63. mloda_plugins/feature_group/experimental/source_input_feature.py +7 -7
  64. mloda_plugins/feature_group/experimental/text_cleaning/base.py +4 -4
  65. mloda_plugins/feature_group/experimental/time_window/base.py +44 -17
  66. mloda_plugins/feature_group/experimental/time_window/pandas.py +59 -8
  67. mloda_plugins/feature_group/experimental/time_window/pyarrow.py +95 -40
  68. mloda_plugins/feature_group/input_data/api_data/api_data.py +122 -0
  69. mloda_plugins/feature_group/input_data/read_context_files.py +1 -1
  70. mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +144 -0
  71. mloda_plugins/feature_group/input_data/read_files/csv.py +139 -0
  72. mloda_plugins/feature_group/input_data/read_files/feather.py +90 -0
  73. mloda_plugins/feature_group/input_data/read_files/json.py +108 -0
  74. mloda_plugins/feature_group/input_data/read_files/orc.py +90 -0
  75. mloda_plugins/feature_group/input_data/read_files/parquet.py +90 -0
  76. mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +118 -0
  77. mloda-0.2.13.dist-info/METADATA +0 -477
  78. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/WHEEL +0 -0
  79. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/entry_points.txt +0 -0
  80. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/licenses/LICENSE.TXT +0 -0
  81. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/licenses/NOTICE.md +0 -0
  82. {mloda-0.2.13.dist-info → mloda-0.2.15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,391 @@
1
+ Metadata-Version: 2.4
2
+ Name: mloda
3
+ Version: 0.2.15
4
+ Summary: Rethinking Data and Feature Engineering
5
+ Author-email: Tom Kaltofen <info@mloda.ai>
6
+ License: Apache-2.0
7
+ Project-URL: Bug Tracker, https://github.com/mloda-ai/mloda/issues
8
+ Project-URL: Documentation, https://mloda-ai.github.io/mloda/
9
+ Project-URL: Source Code, https://github.com/mloda-ai/mloda
10
+ Project-URL: PyPI, https://pypi.org/project/mloda/
11
+ Project-URL: Homepage, https://mloda.ai
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: <3.14,>=3.8
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE.TXT
16
+ License-File: NOTICE.md
17
+ Requires-Dist: pyarrow
18
+ Dynamic: license-file
19
+
20
+ # mloda: Make data, feature and context engineering shareable
21
+
22
+ [![Website](https://img.shields.io/badge/website-mloda.ai-blue.svg)](https://mloda.ai)
23
+ [![Documentation](https://img.shields.io/badge/docs-github.io-blue.svg)](https://mloda-ai.github.io/mloda/)
24
+ [![PyPI version](https://badge.fury.io/py/mloda.svg)](https://badge.fury.io/py/mloda)
25
+ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/mloda-ai/mloda/blob/main/LICENSE.TXT)
26
+ [![Tox](https://img.shields.io/badge/tested_with-tox-blue.svg)](https://tox.readthedocs.io/)
27
+ [![Checked with mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
28
+ [![code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
29
+
30
+ > **⚠️ Early Version Notice**: mloda is in active development. Some features described below are still being implemented. We're actively seeking feedback to shape the future of the framework. [Share your thoughts!](https://github.com/mloda-ai/mloda/issues/)
31
+
32
+ ## 🍳 Think of mloda Like Cooking Recipes
33
+
34
+ **Traditional Data Pipelines** = Making everything from scratch
35
+ - Want pasta? Make noodles, sauce, cheese from raw ingredients
36
+ - Want pizza? Start over - make dough, sauce, cheese again
37
+ - Want lasagna? Repeat everything once more
38
+ - Can't share recipes easily - they're mixed with your kitchen setup
39
+
40
+ **mloda** = Using recipe components
41
+ - Create reusable recipes: "tomato sauce", "pasta dough", "cheese blend"
42
+ - Use same "tomato sauce" for pasta, pizza, lasagna
43
+ - Switch kitchens (home → restaurant → food truck) - same recipes work
44
+ - Share your "tomato sauce" recipe with friends - they don't need your whole kitchen
45
+
46
+ **Result**: Instead of rebuilding the same thing 10 times, build once and reuse everywhere!
47
+
48
+ ### Installation
49
+ ```bash
50
+ pip install mloda
51
+ ```
52
+
53
+ ### 1. The Core API Call - Your Starting Point
54
+
55
+ **Complete Working Example with DataCreator**
56
+
57
+ ```python
58
+ # Step 1: Create a sample data source using DataCreator
59
+ from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
60
+ from mloda_core.abstract_plugins.components.input_data.creator.data_creator import DataCreator
61
+ from mloda_core.abstract_plugins.components.feature_set import FeatureSet
62
+ from typing import Any, Optional
63
+ from mloda_core.abstract_plugins.components.input_data.base_input_data import BaseInputData
64
+ import pandas as pd
65
+
66
+ class SampleData(AbstractFeatureGroup):
67
+ @classmethod
68
+ def input_data(cls) -> Optional[BaseInputData]:
69
+ return DataCreator({"customer_id", "age", "income"})
70
+
71
+ @classmethod
72
+ def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
73
+ return pd.DataFrame({
74
+ 'customer_id': ['C001', 'C002', 'C003', 'C004', 'C005'],
75
+ 'age': [25, 30, 35, None, 45],
76
+ 'income': [50000, 75000, None, 60000, 85000]
77
+ })
78
+
79
+ # Step 2: Load mloda plugins and run pipeline
80
+ from mloda_core.api.request import mlodaAPI
81
+ from mloda_core.abstract_plugins.plugin_loader.plugin_loader import PluginLoader
82
+ from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataframe
83
+
84
+ PluginLoader.all()
85
+
86
+ result = mlodaAPI.run_all(
87
+ features=[
88
+ "customer_id", # Original column
89
+ "age", # Original column
90
+ "standard_scaled__income" # Transform: scale income to mean=0, std=1
91
+ ],
92
+ compute_frameworks={PandasDataframe}
93
+ )
94
+
95
+ # Step 3: Get your processed data
96
+ data = result[0]
97
+ print(data.head())
98
+ # Output: DataFrame with customer_id, age, and scaled income
99
+ ```
100
+
101
+ **What just happened?**
102
+ 1. **SampleData class** - Created a data source using DataCreator (generates data in-memory)
103
+ 2. **PluginLoader.all()** - Loaded all available transformations (scaling, encoding, imputation, etc.)
104
+ 3. **mlodaAPI.run_all()** - Executed the feature pipeline:
105
+ - Got data from `SampleData`
106
+ - Extracted `customer_id` and `age` as-is
107
+ - Applied StandardScaler to `income` → `standard_scaled__income`
108
+ 4. **result[0]** - Retrieved the processed pandas DataFrame
109
+
110
+ > **Key Insight**: The syntax `standard_scaled__income` is mloda's **feature chaining**. Behind the scenes, mloda creates a chain of **feature group** objects (`StandardScalingFeatureGroup` → `SourceFeatureGroup`), automatically resolving dependencies. See [Section 2](#2-understanding-feature-chaining-transformations) for full explanation of chaining syntax and [Section 3](#3-advanced-feature-objects-for-complex-configurations) to learn about the underlying feature group architecture.
111
+
112
+ ### 2. Understanding Feature Chaining (Transformations)
113
+
114
+ **The Power of Double Underscore `__` Syntax**
115
+
116
+ As mentioned in Section 1, feature chaining (like `standard_scaled__income`) is syntactic sugar that mloda converts into a chain of **feature group objects**. Each transformation (`standard_scaled`, `mean_imputed`, etc.) corresponds to a specific feature group class.
117
+
118
+ mloda's chaining syntax lets you compose transformations using `__` as a separator:
119
+
120
+ ```python
121
+ # Pattern examples (these show the syntax):
122
+ # "standard_scaled__income" # Scale income column
123
+ # "mean_imputed__age" # Fill missing age values with mean
124
+ # "onehot_encoded__category" # One-hot encode category column
125
+ #
126
+ # You can chain transformations!
127
+ # Pattern: {transform2}__{transform1}__{source}
128
+ # "standard_scaled__mean_imputed__income" # First impute, then scale
129
+
130
+ # Real working example:
131
+ _ = ["standard_scaled__income", "mean_imputed__age"] # Valid feature names
132
+ ```
133
+
134
+ **Available Transformations:**
135
+
136
+ | Transformation | Purpose | Example |
137
+ |---------------|---------|---------|
138
+ | `standard_scaled__` | StandardScaler (mean=0, std=1) | `standard_scaled__income` |
139
+ | `minmax_scaled__` | MinMaxScaler (range [0,1]) | `minmax_scaled__age` |
140
+ | `robust_scaled__` | RobustScaler (median-based, handles outliers) | `robust_scaled__price` |
141
+ | `mean_imputed__` | Fill missing values with mean | `mean_imputed__salary` |
142
+ | `median_imputed__` | Fill missing values with median | `median_imputed__age` |
143
+ | `mode_imputed__` | Fill missing values with mode | `mode_imputed__category` |
144
+ | `onehot_encoded__` | One-hot encoding | `onehot_encoded__state` |
145
+ | `label_encoded__` | Label encoding | `label_encoded__priority` |
146
+
147
+ > **Key Insight**: Transformations are read right-to-left. `standard_scaled__mean_imputed__income` means: take `income` → apply mean imputation → apply standard scaling.
148
+
149
+ **When You Need More Control**
150
+
151
+ Most of the time, simple string syntax is enough:
152
+ ```python
153
+ # Example feature list (simple strings)
154
+ example_features = ["customer_id", "standard_scaled__income", "onehot_encoded__region"]
155
+ ```
156
+
157
+ But for advanced configurations, you can explicitly create `Feature` objects with custom options (covered in Section 3).
158
+
159
+ ### 3. Advanced: Feature Objects for Complex Configurations
160
+
161
+ **Understanding the Feature Group Architecture**
162
+
163
+ Behind the scenes, chaining like `standard_scaled__income` creates feature group objects:
164
+
165
+ ```python
166
+ # When you write this string:
167
+ "standard_scaled__income"
168
+
169
+ # mloda creates this chain of feature groups:
170
+ # StandardScalingFeatureGroup (reads from) → IncomeSourceFeatureGroup
171
+ ```
172
+
173
+ **Explicit Feature Objects**
174
+
175
+ For truly custom configurations, you can use `Feature` objects:
176
+
177
+ ```python
178
+ # Example (for custom feature configurations):
179
+ # from mloda_core.abstract_plugins.components.feature import Feature
180
+ # from mloda_core.abstract_plugins.components.options import Options
181
+ #
182
+ # features = [
183
+ # "customer_id", # Simple string
184
+ # Feature(
185
+ # "custom_feature",
186
+ # options=Options({
187
+ # "custom_param": "value",
188
+ # "mloda_source_features": "source_column",
189
+ # })
190
+ # ),
191
+ # ]
192
+ #
193
+ # result = mlodaAPI.run_all(
194
+ # features=features,
195
+ # compute_frameworks={PandasDataframe}
196
+ # )
197
+ ```
198
+
199
+ > **Deep Dive**: Each transformation type (`standard_scaled__`, `mean_imputed__`, etc.) maps to a feature group class in `mloda_plugins/feature_group/`. For example, `standard_scaled__` uses `ScalingFeatureGroup`. When you chain transformations, mloda builds a dependency graph of these feature groups and executes them in the correct order. This architecture makes mloda extensible - you can create custom feature groups for your own transformations!
200
+
201
+ ### 4. Data Access - Where Your Data Comes From
202
+
203
+ **Three Ways to Provide Data**
204
+
205
+ mloda supports multiple data access patterns depending on your use case:
206
+
207
+ **1. DataCreator** - For testing and demos (used in our examples)
208
+ ```python
209
+ # Perfect for creating sample/test data in-memory
210
+ # See Section 1 for the SampleData class definition using DataCreator:
211
+ #
212
+ # class SampleData(AbstractFeatureGroup):
213
+ # @classmethod
214
+ # def input_data(cls) -> Optional[BaseInputData]:
215
+ # return DataCreator({"customer_id", "age", "income"})
216
+ #
217
+ # @classmethod
218
+ # def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
219
+ # return pd.DataFrame({
220
+ # 'customer_id': ['C001', 'C002'],
221
+ # 'age': [25, 30],
222
+ # 'income': [50000, 75000]
223
+ # })
224
+ ```
225
+
226
+ **2. DataAccessCollection** - For production file/database access
227
+ ```python
228
+ # Example (requires actual files/databases):
229
+ # from mloda_core.abstract_plugins.components.data_access_collection import DataAccessCollection
230
+ #
231
+ # # Read from files, folders, or databases
232
+ # data_access = DataAccessCollection(
233
+ # files={"customers.csv", "orders.parquet"}, # CSV/Parquet/JSON files
234
+ # folders={"data/raw/"}, # Entire directories
235
+ # credential_dicts={"host": "db.example.com"} # Database credentials
236
+ # )
237
+ #
238
+ # result = mlodaAPI.run_all(
239
+ # features=["customer_id", "standard_scaled__income"],
240
+ # compute_frameworks={PandasDataframe},
241
+ # data_access_collection=data_access
242
+ # )
243
+ ```
244
+
245
+ **3. ApiData** - For runtime data injection (web requests, real-time predictions)
246
+ ```python
247
+ # Example (for API endpoints and real-time predictions):
248
+ # from mloda_core.abstract_plugins.components.input_data.api.api_input_data_collection import ApiInputDataCollection
249
+ #
250
+ # api_input_data_collection = ApiInputDataCollection()
251
+ # api_data = api_input_data_collection.setup_key_api_data(
252
+ # key_name="PredictionData",
253
+ # api_input_data={"customer_id": ["C001", "C002"], "age": [25, 30]}
254
+ # )
255
+ #
256
+ # result = mlodaAPI.run_all(
257
+ # features=["customer_id", "standard_scaled__age"],
258
+ # compute_frameworks={PandasDataframe},
259
+ # api_input_data_collection=api_input_data_collection,
260
+ # api_data=api_data
261
+ # )
262
+ ```
263
+
264
+ > **Key Insight**: Use **DataCreator** for demos, **DataAccessCollection** for batch processing from files/databases, and **ApiData** for real-time predictions and web services.
265
+
266
+ ### 5. Compute Frameworks - Choose Your Processing Engine
267
+
268
+ **Using Different Data Processing Libraries**
269
+
270
+ mloda supports multiple compute frameworks (pandas, polars, pyarrow, etc.). Most users start with pandas:
271
+
272
+ ```python
273
+ # Using the SampleData class from Section 1
274
+ # Default: Everything processes with pandas
275
+ result = mlodaAPI.run_all(
276
+ features=["customer_id", "standard_scaled__income"],
277
+ compute_frameworks={PandasDataframe} # Use pandas for all features
278
+ )
279
+
280
+ data = result[0] # Returns pandas DataFrame
281
+ print(type(data)) # <class 'pandas.core.frame.DataFrame'>
282
+ ```
283
+
284
+ **Why Compute Frameworks Matter:**
285
+ - **Pandas**: Best for small-to-medium datasets, rich ecosystem, familiar API
286
+ - **Polars**: High performance for larger datasets
287
+ - **PyArrow**: Memory-efficient, great for columnar data
288
+ - **Spark**: Distributed processing for big data
289
+
290
+ > **For most use cases**: Start with `compute_frameworks={PandasDataframe}` and switch to others only if you need specific performance characteristics.
291
+
292
+ ### 6. Putting It All Together - Complete ML Pipeline
293
+
294
+ **Real-World Example: Customer Churn Prediction**
295
+
296
+ Let's build a complete machine learning pipeline with mloda:
297
+
298
+ ```python
299
+ # Step 1: Extend SampleData with more features for ML
300
+ # (Reuse the same class to avoid conflicts)
301
+ SampleData._original_calculate = SampleData.calculate_feature
302
+
303
+ @classmethod
304
+ def _extended_calculate(cls, data: Any, features: FeatureSet) -> Any:
305
+ import numpy as np
306
+ np.random.seed(42)
307
+ n = 100
308
+ return pd.DataFrame({
309
+ 'customer_id': [f'C{i:03d}' for i in range(n)],
310
+ 'age': np.random.randint(18, 70, n),
311
+ 'income': np.random.randint(30000, 120000, n),
312
+ 'account_balance': np.random.randint(0, 10000, n),
313
+ 'subscription_tier': np.random.choice(['Basic', 'Premium', 'Enterprise'], n),
314
+ 'region': np.random.choice(['North', 'South', 'East', 'West'], n),
315
+ 'customer_segment': np.random.choice(['New', 'Regular', 'VIP'], n),
316
+ 'churned': np.random.choice([0, 1], n)
317
+ })
318
+
319
+ SampleData.calculate_feature = _extended_calculate
320
+ SampleData._input_data_original = SampleData.input_data()
321
+
322
+ @classmethod
323
+ def _extended_input_data(cls) -> Optional[BaseInputData]:
324
+ return DataCreator({"customer_id", "age", "income", "account_balance",
325
+ "subscription_tier", "region", "customer_segment", "churned"})
326
+
327
+ SampleData.input_data = _extended_input_data
328
+
329
+ # Step 2: Run feature engineering pipeline
330
+ from sklearn.model_selection import train_test_split
331
+ from sklearn.ensemble import RandomForestClassifier
332
+ from sklearn.metrics import accuracy_score
333
+
334
+ result = mlodaAPI.run_all(
335
+ features=[
336
+ "customer_id",
337
+ "standard_scaled__age",
338
+ "standard_scaled__income",
339
+ "robust_scaled__account_balance",
340
+ "label_encoded__subscription_tier",
341
+ "label_encoded__region",
342
+ "label_encoded__customer_segment",
343
+ "churned"
344
+ ],
345
+ compute_frameworks={PandasDataframe}
346
+ )
347
+
348
+ # Step 3: Prepare for ML
349
+ processed_data = result[0]
350
+ if len(processed_data.columns) > 2: # Check we have features besides customer_id and churned
351
+ X = processed_data.drop(['customer_id', 'churned'], axis=1)
352
+ y = processed_data['churned']
353
+
354
+ # Step 4: Train and evaluate
355
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
356
+ model = RandomForestClassifier(n_estimators=100, random_state=42)
357
+ model.fit(X_train, y_train)
358
+ accuracy = accuracy_score(y_test, model.predict(X_test))
359
+ print(f"🎯 Model Accuracy: {accuracy:.2%}")
360
+ else:
361
+ print("⚠️ Skipping ML - extend SampleData first with more features!")
362
+ ```
363
+
364
+ **What mloda Did For You:**
365
+ 1. ✅ Generated sample data with DataCreator
366
+ 2. ✅ Scaled numeric features (StandardScaler & RobustScaler)
367
+ 3. ✅ Encoded categorical features (Label encoding)
368
+ 4. ✅ Returned clean DataFrame ready for sklearn
369
+
370
+ > **🎉 You now understand mloda's complete workflow!** The same transformations work across pandas, polars, pyarrow, and other frameworks - just change `compute_frameworks`.
371
+
372
+ ## 📖 Documentation
373
+
374
+ - **[Getting Started](https://mloda-ai.github.io/mloda/chapter1/installation/)** - Installation and first steps
375
+ - **[sklearn Integration](https://mloda-ai.github.io/mloda/examples/sklearn_integration_basic/)** - Complete tutorial
376
+ - **[Feature Groups](https://mloda-ai.github.io/mloda/chapter1/feature-groups/)** - Core concepts
377
+ - **[Compute Frameworks](https://mloda-ai.github.io/mloda/chapter1/compute-frameworks/)** - Technology integration
378
+ - **[API Reference](https://mloda-ai.github.io/mloda/in_depth/mloda-api/)** - Complete API documentation
379
+
380
+ ## 🤝 Contributing
381
+
382
+ We welcome contributions! Whether you're building plugins, adding features, or improving documentation, your input is invaluable.
383
+
384
+ - **[Development Guide](https://mloda-ai.github.io/mloda/development/)** - How to contribute
385
+ - **[GitHub Issues](https://github.com/mloda-ai/mloda/issues/)** - Report bugs or request features
386
+ - **[Email](mailto:info@mloda.ai)** - Direct contact
387
+
388
+ ## 📄 License
389
+
390
+ This project is licensed under the [Apache License, Version 2.0](https://github.com/mloda-ai/mloda/blob/main/LICENSE.TXT).
391
+ ---