mloda 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/METADATA +166 -178
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/RECORD +44 -45
- mloda_core/abstract_plugins/abstract_feature_group.py +1 -27
- mloda_core/abstract_plugins/components/base_artifact.py +32 -2
- mloda_core/abstract_plugins/components/feature.py +18 -3
- mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +247 -88
- mloda_core/abstract_plugins/components/feature_collection.py +5 -1
- mloda_core/abstract_plugins/components/feature_set.py +24 -0
- mloda_core/abstract_plugins/components/options.py +177 -17
- mloda_core/api/request.py +5 -1
- mloda_core/core/engine.py +2 -19
- mloda_core/prepare/identify_feature_group.py +23 -1
- mloda_core/runtime/run.py +0 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +0 -5
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +130 -79
- mloda_plugins/feature_group/experimental/clustering/base.py +149 -101
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +147 -85
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +9 -1
- mloda_plugins/feature_group/experimental/default_options_key.py +6 -0
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +174 -137
- mloda_plugins/feature_group/experimental/forecasting/base.py +178 -105
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +5 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +75 -11
- mloda_plugins/feature_group/experimental/geo_distance/base.py +159 -96
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +4 -4
- mloda_plugins/feature_group/experimental/node_centrality/base.py +101 -72
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +104 -60
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +191 -84
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +101 -57
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +15 -21
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +103 -67
- mloda_plugins/feature_group/experimental/time_window/base.py +109 -48
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +11 -1
- mloda_plugins/feature_group/input_data/read_db.py +9 -1
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +11 -3
- mloda_plugins/feature_group/input_data/read_file.py +9 -1
- mloda_core/abstract_plugins/components/feature_chainer/feature_chainer_parser_configuration.py +0 -177
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/WHEEL +0 -0
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/entry_points.txt +0 -0
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/licenses/NOTICE.md +0 -0
- {mloda-0.2.11.dist-info → mloda-0.2.13.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mloda
|
|
3
|
-
Version: 0.2.11
|
|
3
|
+
Version: 0.2.13
|
|
4
4
|
Summary: Rethinking Data and Feature Engineering
|
|
5
5
|
Author-email: Tom Kaltofen <info@mloda.ai>
|
|
6
6
|
License: Apache License
|
|
@@ -219,43 +219,18 @@ License-File: NOTICE.md
|
|
|
219
219
|
Requires-Dist: pyarrow
|
|
220
220
|
Dynamic: license-file
|
|
221
221
|
|
|
222
|
-
# mloda:
|
|
222
|
+
# mloda: Make data and feature engineering shareable
|
|
223
223
|
|
|
224
|
+
[Website](https://mloda.ai)
|
|
224
225
|
[Documentation](https://mloda-ai.github.io/mloda/)
|
|
225
226
|
[PyPI version](https://badge.fury.io/py/mloda)
|
|
226
227
|
[License](https://github.com/mloda-ai/mloda/blob/main/LICENSE.TXT)
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
228
|
+
[tox](https://tox.readthedocs.io/)
|
|
229
|
+
[mypy](http://mypy-lang.org/)
|
|
230
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
230
231
|
|
|
231
232
|
> **⚠️ Early Version Notice**: mloda is in active development. Some features described below are still being implemented. We're actively seeking feedback to shape the future of the framework. [Share your thoughts!](https://github.com/mloda-ai/mloda/issues/)
|
|
232
233
|
|
|
233
|
-
## 🚀 Transforming Feature Engineering Through Process-Data Separation
|
|
234
|
-
|
|
235
|
-
mloda **revolutionizes feature engineering** by separating **processes** (transformations) from **data**, enabling unprecedented flexibility, reusability, and scalability in machine learning workflows.
|
|
236
|
-
|
|
237
|
-
**🤖 Built for the AI Era**: While others write code, AI writes mloda plugins. *Check the inline comments in our experimental plugin code - all AI written.*
|
|
238
|
-
|
|
239
|
-
**🌐 Share Without Secrets**: Traditional pipelines lock business logic inside - mloda plugins separate transformations from business context, enabling safe community sharing.
|
|
240
|
-
|
|
241
|
-
**🎯 Try the first example out NOW:** [sklearn Integration Example](https://mloda-ai.github.io/mloda/examples/sklearn_integration_basic/) - See mloda transform traditional sklearn pipelines!
|
|
242
|
-
|
|
243
|
-
## 📋 Table of Contents
|
|
244
|
-
|
|
245
|
-
- [🍳 Think of mloda Like Cooking Recipes](#-think-of-mloda-like-cooking-recipes)
|
|
246
|
-
- [💡 The Value Proposition](#-the-value-proposition)
|
|
247
|
-
- [📊 Why Process-Data Separation Changes Everything](#-why-process-data-separation-changes-everything)
|
|
248
|
-
- [🚀 Quick Start](#-quick-start)
|
|
249
|
-
- [🔄 Write Once, Run Anywhere](#-write-once-run-anywhere-environments--frameworks)
|
|
250
|
-
- [🌍 Deploy Anywhere Python Runs](#-deploy-anywhere-python-runs)
|
|
251
|
-
- [🎯 Minimal Dependencies](#-minimal-dependencies-maximum-compatibility)
|
|
252
|
-
- [🔧 Complete Data Processing](#-complete-data-processing-capabilities)
|
|
253
|
-
- [👥 Role-Based Governance](#-logical-role-based-data-governance)
|
|
254
|
-
- [🌐 Community-Driven Plugin Ecosystem](#-community-driven-plugin-ecosystem)
|
|
255
|
-
- [📖 Documentation](#-documentation)
|
|
256
|
-
- [🤝 Contributing](#-contributing)
|
|
257
|
-
- [📄 License](#-license)
|
|
258
|
-
|
|
259
234
|
## 🍳 Think of mloda Like Cooking Recipes
|
|
260
235
|
|
|
261
236
|
**Traditional Data Pipelines** = Making everything from scratch
|
|
@@ -270,201 +245,215 @@ mloda **revolutionizes feature engineering** by separating **processes** (transf
|
|
|
270
245
|
- Switch kitchens (home → restaurant → food truck) - same recipes work
|
|
271
246
|
- Share your "tomato sauce" recipe with friends - they don't need your whole kitchen
|
|
272
247
|
|
|
273
|
-
**Real Example**: You need to clean customer ages (remove outliers, fill missing values)
|
|
274
|
-
- **Traditional**: Write age-cleaning code for training, testing, production separately
|
|
275
|
-
- **mloda**: Create one "clean_age" plugin, use everywhere - development, testing, production, analysis
|
|
276
|
-
|
|
277
248
|
**Result**: Instead of rebuilding the same thing 10 times, build once and reuse everywhere!
|
|
278
249
|
|
|
279
|
-
|
|
250
|
+
### Installation
|
|
251
|
+
```bash
|
|
252
|
+
pip install mloda
|
|
253
|
+
```
|
|
280
254
|
|
|
281
|
-
|
|
255
|
+
### 1. The Core API Call - Your Starting Point
|
|
282
256
|
|
|
283
|
-
|
|
284
|
-
|-----------|----------------------|------------------|
|
|
285
|
-
| **⏰ Repetitive Work** | Rebuild same transformations for each environment | Write once, reuse across all environments |
|
|
286
|
-
| **🐛 Consistency Issues** | Different implementations create bugs | Single implementation ensures consistency |
|
|
287
|
-
| **👥 Knowledge Silos** | Senior expertise locked in complex pipelines | Reusable patterns everyone can use |
|
|
288
|
-
| **🚀 Deployment Friction** | Train/serve skew causes production issues | Same logic guaranteed everywhere |
|
|
289
|
-
| **💡 Innovation Bottleneck** | Time spent on solved problems | Focus energy on unique business value |
|
|
257
|
+
**The One Command That Does Everything**
|
|
290
258
|
|
|
291
|
-
|
|
259
|
+
```python
|
|
260
|
+
# This is the heart of mloda. You describe what you want and mloda resolves the dependencies.
|
|
261
|
+
from mloda_core.api.request import mlodaAPI
|
|
292
262
|
|
|
293
|
-
|
|
263
|
+
result = mlodaAPI.run_all(
|
|
264
|
+
features=["age", "standard_scaled__weight"]
|
|
265
|
+
)
|
|
294
266
|
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
| **📝 Maintainability** | Complex nested pipeline objects | Clean, declarative feature names |
|
|
300
|
-
| **🏭 Scalability** | Framework-specific limitations | Horizontal scaling without architectural changes |
|
|
267
|
+
# That's it! You get processed data back
|
|
268
|
+
data = result[0]
|
|
269
|
+
print(data.head())
|
|
270
|
+
```
|
|
301
271
|
|
|
302
|
-
|
|
272
|
+
**What just happened?**
|
|
273
|
+
- mloda found your data automatically
|
|
274
|
+
- Applied transformations (scaling, encoding)
|
|
275
|
+
- Returned a clean, ready-to-use DataFrame
|
|
303
276
|
|
|
304
|
-
|
|
277
|
+
> **Key Insight**: As long as the plugins and data accesses exist, mloda can derive any feature automatically.
|
|
305
278
|
|
|
306
|
-
###
|
|
307
|
-
```bash
|
|
308
|
-
pip install mloda
|
|
309
|
-
```
|
|
279
|
+
### 2. Setting Up Your Data
|
|
310
280
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
281
|
+
**Using DataCreator - The mloda Way**
|
|
282
|
+
|
|
283
|
+
```python
|
|
284
|
+
# DataCreator: Perfect for testing, demos, and prototyping
|
|
285
|
+
# Use this when you need synthetic data or want to test mloda without external files
|
|
316
286
|
from mloda_core.abstract_plugins.components.input_data.creator.data_creator import DataCreator
|
|
317
287
|
from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
|
|
318
288
|
|
|
319
|
-
|
|
320
|
-
n_samples = 1000
|
|
321
|
-
|
|
322
|
-
class YourFirstSyntheticDataSet(AbstractFeatureGroup):
|
|
289
|
+
class SampleDataFeature(AbstractFeatureGroup):
|
|
323
290
|
@classmethod
|
|
324
291
|
def input_data(cls):
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
292
|
+
# Define what columns your data will have
|
|
293
|
+
return DataCreator({
|
|
294
|
+
"age", "weight", "state", "income", "target"
|
|
295
|
+
})
|
|
296
|
+
|
|
297
|
+
@classmethod
|
|
328
298
|
def calculate_feature(cls, data, features):
|
|
299
|
+
# Generate sample data that matches your DataCreator specification
|
|
300
|
+
# This is where you'd normally load from files, databases, or APIs
|
|
329
301
|
return {
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
302
|
+
'age': [25, 30, 35, None, 45, 28, 33],
|
|
303
|
+
'weight': [150, 180, None, 200, 165, 140, 175],
|
|
304
|
+
'state': ['CA', 'NY', 'TX', 'CA', 'FL', 'NY', 'TX'],
|
|
305
|
+
'income': [50000, 75000, 85000, 60000, None, 45000, 70000],
|
|
306
|
+
'target': [1, 0, 1, 0, 1, 0, 1]
|
|
307
|
+
}
|
|
308
|
+
```
|
|
335
309
|
|
|
336
|
-
|
|
337
|
-
features = [
|
|
338
|
-
"standard_scaled__mean_imputed__age",
|
|
339
|
-
"onehot_encoded__state",
|
|
340
|
-
"robust_scaled__weight"
|
|
341
|
-
]
|
|
310
|
+
**When to Use DataCreator vs Other Data Access Methods:**
|
|
342
311
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
312
|
+
- **DataCreator**: For testing, demos, synthetic data, or when you want to generate data programmatically within mloda
|
|
313
|
+
- **File Access** (`DataAccessCollection` with files): When your data lives in CSV, JSON, Parquet, etc.
|
|
314
|
+
- **Database Access** (`DataAccessCollection` with credentials): When connecting to SQL databases, data warehouses
|
|
315
|
+
- **API Access**: When fetching data from REST APIs or other web services
|
|
346
316
|
|
|
347
|
-
|
|
317
|
+
> **Key Insight**: DataCreator is mloda's built-in data generation tool - perfect for getting started without external dependencies. Once you're ready for production, switch to file or database access methods.
|
|
348
318
|
|
|
349
|
-
**
|
|
319
|
+
**Quick Start with Your Own Data:**
|
|
320
|
+
```python
|
|
321
|
+
# Replace DataCreator with real data access
|
|
322
|
+
from mloda_core.abstract_plugins.components.data_access_collection import DataAccessCollection
|
|
350
323
|
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
def clean_age_training(data): ... # Training pipeline
|
|
354
|
-
def clean_age_testing(data): ... # Testing pipeline
|
|
355
|
-
def clean_age_production(data): ... # Production API
|
|
356
|
-
def clean_age_spark(data): ... # Big data processing
|
|
357
|
-
def clean_age_analysis(data): ... # Analytics
|
|
324
|
+
# For files
|
|
325
|
+
data_access = DataAccessCollection(files={"your_data.csv"})
|
|
358
326
|
|
|
359
|
-
#
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
# Single implementation for all contexts
|
|
364
|
-
return process_age_data(data["age"])
|
|
365
|
-
|
|
366
|
-
# Same plugin, different environments & frameworks
|
|
367
|
-
mlodaAPI.run_all(["clean_age"], compute_frameworks={PandasDataframe}) # Dev
|
|
368
|
-
mlodaAPI.run_all(["clean_age"], compute_frameworks={SparkDataframe}) # Production
|
|
369
|
-
mlodaAPI.run_all(["clean_age"], compute_frameworks={PolarsDataframe}) # High performance
|
|
370
|
-
mlodaAPI.run_all(["clean_age"], compute_frameworks={DuckDBFramework}) # Analytics
|
|
327
|
+
# For databases
|
|
328
|
+
data_access = DataAccessCollection(
|
|
329
|
+
credential_dicts=[{"host": "your-db.com", "username": "user"}]
|
|
330
|
+
)
|
|
371
331
|
```
|
|
372
332
|
|
|
373
|
-
|
|
333
|
+
### 3. Understanding What You Get Back
|
|
374
334
|
|
|
375
|
-
|
|
335
|
+
**The Result Structure**
|
|
376
336
|
|
|
377
|
-
```
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
CSV --> RESULT
|
|
391
|
-
BATCH --> RESULT
|
|
392
|
-
SINGLE --> RESULT
|
|
393
|
-
ANALYSIS --> RESULT
|
|
394
|
-
|
|
395
|
-
style CSV fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
|
|
396
|
-
style BATCH fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
|
397
|
-
style SINGLE fill:#e1f5fe,stroke:#0288d1,stroke-width:2px
|
|
398
|
-
style ANALYSIS fill:#fce4ec,stroke:#c2185b,stroke-width:2px
|
|
399
|
-
style RESULT fill:#e8f5e8,stroke:#4caf50,stroke-width:3px
|
|
337
|
+
```python
|
|
338
|
+
from mloda_core.api.request import mlodaAPI
|
|
339
|
+
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataframe
|
|
340
|
+
|
|
341
|
+
result = mlodaAPI.run_all(features, compute_frameworks={PandasDataframe})
|
|
342
|
+
|
|
343
|
+
# result is always a LIST of result objects
|
|
344
|
+
data_list = result
|
|
345
|
+
# Each object matches your compute framework type: pd.DataFrame, spark.DataFrame, etc.
|
|
346
|
+
|
|
347
|
+
# Access your processed data
|
|
348
|
+
data = result[0] # Most common case: single result
|
|
349
|
+
print(f"Shape: {data.shape}, Columns: {list(data.columns)}")
|
|
400
350
|
```
|
|
401
351
|
|
|
402
|
-
|
|
352
|
+
> **Key Insight**: mloda returns a list of results. Most simple cases return a single DataFrame that you access with `result[0]`.
|
|
403
353
|
|
|
404
|
-
|
|
354
|
+
### 4. The Features Parameter
|
|
405
355
|
|
|
406
|
-
|
|
407
|
-
|-------------|----------|---------|
|
|
408
|
-
| **💻 Local Development** | Prototyping & testing | Jupyter notebooks, VS Code |
|
|
409
|
-
| **☁️ Any Cloud** | Production workloads | AWS, GCP, Azure, DigitalOcean |
|
|
410
|
-
| **🏢 On-Premise** | Enterprise & compliance | Air-gapped environments |
|
|
411
|
-
| **📊 Notebooks** | Data science workflows | Jupyter, Colab, Databricks |
|
|
412
|
-
| **🌐 Web APIs** | Real-time serving | Flask, FastAPI, Django |
|
|
413
|
-
| **⚙️ Orchestration** | Batch processing | Airflow, Prefect, Dagster |
|
|
414
|
-
| **🐳 Containers** | Microservices | Docker, Kubernetes |
|
|
415
|
-
| **⚡ Serverless** | Event-driven | AWS Lambda, Google Functions |
|
|
356
|
+
**Feature Object Syntax**
|
|
416
357
|
|
|
417
|
-
|
|
358
|
+
```python
|
|
359
|
+
from mloda_core.abstract_plugins.components.feature import Feature
|
|
360
|
+
from mloda_core.abstract_plugins.components.options import Options
|
|
361
|
+
from mloda_core.abstract_plugins.plugin_loader.plugin_loader import PluginLoader
|
|
418
362
|
|
|
419
|
-
|
|
363
|
+
# Load all available plugins (required before using features)
|
|
364
|
+
PluginLoader.all()
|
|
420
365
|
|
|
421
|
-
|
|
366
|
+
features = [
|
|
367
|
+
"age", # Simple string
|
|
368
|
+
Feature(
|
|
369
|
+
"weight_replaced",
|
|
370
|
+
options=Options(
|
|
371
|
+
group={
|
|
372
|
+
"imputation_method": "mean",
|
|
373
|
+
"mloda_source_feature": "weight",
|
|
374
|
+
}
|
|
375
|
+
),
|
|
376
|
+
),
|
|
377
|
+
"onehot_encoded__state" # Chaining syntax
|
|
378
|
+
]
|
|
379
|
+
```
|
|
422
380
|
|
|
423
|
-
**
|
|
424
|
-
- **
|
|
425
|
-
- **
|
|
426
|
-
- **
|
|
427
|
-
- **Future-Proof**: Industry standard for columnar data processing
|
|
381
|
+
**Three Ways to Define Features:**
|
|
382
|
+
- **Simple strings**: For basic columns like "age"
|
|
383
|
+
- **Feature objects**: For explicit configuration and advanced options
|
|
384
|
+
- **Chaining syntax**: Convenient shorthand for transformations
|
|
428
385
|
|
|
429
|
-
|
|
386
|
+
### 5. Compute Frameworks
|
|
430
387
|
|
|
431
|
-
|
|
388
|
+
**Choose Your Processing Engine**
|
|
432
389
|
|
|
433
|
-
|
|
390
|
+
```python
|
|
391
|
+
# Different processing engines
|
|
392
|
+
features = [
|
|
393
|
+
Feature("age", compute_framework=PandasDataframe.get_class_name()),
|
|
394
|
+
Feature("weight", compute_framework=PolarsDataframe.get_class_name()),
|
|
395
|
+
]
|
|
434
396
|
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
| **🔀 Merges** | Consolidate data sources | Multiple feature tables into one |
|
|
439
|
-
| **🔍 Filters** | Data selection & quality | Remove outliers, select time ranges |
|
|
440
|
-
| **🏷️ Domain** | Data organization & governance | Logical data grouping and access control |
|
|
397
|
+
# Mixed frameworks: each feature above names its own compute framework
|
|
398
|
+
result = mlodaAPI.run_all(features)
|
|
399
|
+
```
|
|
441
400
|
|
|
442
|
-
|
|
401
|
+
### 6. Data Access
|
|
443
402
|
|
|
444
|
-
|
|
403
|
+
**Tell mloda Where Your Data Lives**
|
|
445
404
|
|
|
446
|
-
|
|
405
|
+
```python
|
|
406
|
+
from mloda_core.abstract_plugins.components.data_access_collection import DataAccessCollection
|
|
447
407
|
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
408
|
+
# Configure data sources
|
|
409
|
+
data_access = DataAccessCollection(
|
|
410
|
+
files={"data/customers.csv"}, # Specific files
|
|
411
|
+
folders={"data/archive/"}, # Entire directories
|
|
412
|
+
credential_dicts=[{"host": "db.example.com"}] # Database credentials
|
|
413
|
+
)
|
|
453
414
|
|
|
454
|
-
|
|
415
|
+
result = mlodaAPI.run_all(
|
|
416
|
+
features=["age", "standard_scaled__income"],
|
|
417
|
+
compute_frameworks={PandasDataframe},
|
|
418
|
+
data_access_collection=data_access
|
|
419
|
+
)
|
|
420
|
+
```
|
|
421
|
+
|
|
422
|
+
> **Key Insight**: Configure data access once globally, and all features can use it automatically.
|
|
455
423
|
|
|
456
|
-
|
|
424
|
+
### 7. Putting It All Together
|
|
457
425
|
|
|
458
|
-
**
|
|
426
|
+
**Real-World Feature Engineering Pipeline**
|
|
459
427
|
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
428
|
+
```python
|
|
429
|
+
# Complete mlodaAPI call
|
|
430
|
+
result = mlodaAPI.run_all(
|
|
431
|
+
# What you want
|
|
432
|
+
features=[
|
|
433
|
+
"standard_scaled__age",
|
|
434
|
+
"onehot_encoded__state",
|
|
435
|
+
"mean_imputed__income"
|
|
436
|
+
],
|
|
437
|
+
|
|
438
|
+
# How to process it
|
|
439
|
+
compute_frameworks={PandasDataframe},
|
|
440
|
+
|
|
441
|
+
# Where to get it
|
|
442
|
+
data_access_collection=DataAccessCollection(files={"data/customers.csv"})
|
|
443
|
+
)
|
|
444
|
+
|
|
445
|
+
# Get your results
|
|
446
|
+
processed_data = result[0]
|
|
447
|
+
print(f"✅ Created {len(processed_data.columns)} features from {len(processed_data)} rows")
|
|
448
|
+
|
|
449
|
+
# Use in your ML pipeline
|
|
450
|
+
from sklearn.model_selection import train_test_split
|
|
451
|
+
X = processed_data.drop('target', axis=1)
|
|
452
|
+
y = processed_data['target']
|
|
453
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
|
454
|
+
```
|
|
466
455
|
|
|
467
|
-
|
|
456
|
+
> **🎉 You now understand mloda's core workflow!**
|
|
468
457
|
|
|
469
458
|
## 📖 Documentation
|
|
470
459
|
|
|
@@ -485,5 +474,4 @@ We welcome contributions! Whether you're building plugins, adding features, or i
|
|
|
485
474
|
## 📄 License
|
|
486
475
|
|
|
487
476
|
This project is licensed under the [Apache License, Version 2.0](https://github.com/mloda-ai/mloda/blob/main/LICENSE.TXT).
|
|
488
|
-
|
|
489
477
|
---
|