mloda 0.2.14__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/METADATA +33 -33
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/RECORD +25 -25
- mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +22 -21
- mloda_core/abstract_plugins/components/link.py +1 -0
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +2 -0
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +7 -7
- mloda_plugins/feature_group/experimental/clustering/base.py +17 -17
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +24 -14
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +17 -17
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +1 -1
- mloda_plugins/feature_group/experimental/forecasting/base.py +19 -18
- mloda_plugins/feature_group/experimental/geo_distance/base.py +25 -15
- mloda_plugins/feature_group/experimental/node_centrality/base.py +29 -29
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +3 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +19 -19
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +12 -16
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +9 -9
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +7 -7
- mloda_plugins/feature_group/experimental/time_window/base.py +22 -20
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/WHEEL +0 -0
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/entry_points.txt +0 -0
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/licenses/NOTICE.md +0 -0
- {mloda-0.2.14.dist-info → mloda-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mloda
|
|
3
|
-
Version: 0.2.14
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Rethinking Data and Feature Engineering
|
|
5
5
|
Author-email: Tom Kaltofen <info@mloda.ai>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -17,7 +17,7 @@ License-File: NOTICE.md
|
|
|
17
17
|
Requires-Dist: pyarrow
|
|
18
18
|
Dynamic: license-file
|
|
19
19
|
|
|
20
|
-
# mloda: Make data and
|
|
20
|
+
# mloda: Make data, feature and context engineering shareable
|
|
21
21
|
|
|
22
22
|
[](https://mloda.ai)
|
|
23
23
|
[](https://mloda-ai.github.io/mloda/)
|
|
@@ -87,7 +87,7 @@ result = mlodaAPI.run_all(
|
|
|
87
87
|
features=[
|
|
88
88
|
"customer_id", # Original column
|
|
89
89
|
"age", # Original column
|
|
90
|
-
"
|
|
90
|
+
"income__standard_scaled" # Transform: scale income to mean=0, std=1
|
|
91
91
|
],
|
|
92
92
|
compute_frameworks={PandasDataframe}
|
|
93
93
|
)
|
|
@@ -104,54 +104,54 @@ print(data.head())
|
|
|
104
104
|
3. **mlodaAPI.run_all()** - Executed the feature pipeline:
|
|
105
105
|
- Got data from `SampleData`
|
|
106
106
|
- Extracted `customer_id` and `age` as-is
|
|
107
|
-
- Applied StandardScaler to `income` → `
|
|
107
|
+
- Applied StandardScaler to `income` → `income__standard_scaled`
|
|
108
108
|
4. **result[0]** - Retrieved the processed pandas DataFrame
|
|
109
109
|
|
|
110
|
-
> **Key Insight**: The syntax `
|
|
110
|
+
> **Key Insight**: The syntax `income__standard_scaled` is mloda's **feature chaining**. Behind the scenes, mloda creates a chain of **feature group** objects (`SourceFeatureGroup` → `StandardScalingFeatureGroup`), automatically resolving dependencies. See [Section 2](#2-understanding-feature-chaining-transformations) for full explanation of chaining syntax and [Section 4](#4-advanced-feature-objects-for-complex-configurations) to learn about the underlying feature group architecture.
|
|
111
111
|
|
|
112
112
|
### 2. Understanding Feature Chaining (Transformations)
|
|
113
113
|
|
|
114
114
|
**The Power of Double Underscore `__` Syntax**
|
|
115
115
|
|
|
116
|
-
As mentioned in Section 1, feature chaining (like `
|
|
116
|
+
As mentioned in Section 1, feature chaining (like `income__standard_scaled`) is syntactic sugar that mloda converts into a chain of **feature group objects**. Each transformation (`standard_scaled`, `mean_imputed`, etc.) corresponds to a specific feature group class.
|
|
117
117
|
|
|
118
118
|
mloda's chaining syntax lets you compose transformations using `__` as a separator:
|
|
119
119
|
|
|
120
120
|
```python
|
|
121
121
|
# Pattern examples (these show the syntax):
|
|
122
|
-
# "
|
|
123
|
-
# "
|
|
124
|
-
# "
|
|
122
|
+
# "income__standard_scaled" # Scale income column
|
|
123
|
+
# "age__mean_imputed" # Fill missing age values with mean
|
|
124
|
+
# "category__onehot_encoded" # One-hot encode category column
|
|
125
125
|
#
|
|
126
126
|
# You can chain transformations!
|
|
127
|
-
# Pattern: {
|
|
128
|
-
# "
|
|
127
|
+
# Pattern: {source}__{transform1}__{transform2}
|
|
128
|
+
# "income__mean_imputed__standard_scaled" # First impute, then scale
|
|
129
129
|
|
|
130
130
|
# Real working example:
|
|
131
|
-
_ = ["
|
|
131
|
+
_ = ["income__standard_scaled", "age__mean_imputed"] # Valid feature names
|
|
132
132
|
```
|
|
133
133
|
|
|
134
134
|
**Available Transformations:**
|
|
135
135
|
|
|
136
136
|
| Transformation | Purpose | Example |
|
|
137
137
|
|---------------|---------|---------|
|
|
138
|
-
| `
|
|
139
|
-
| `
|
|
140
|
-
| `
|
|
141
|
-
| `
|
|
142
|
-
| `
|
|
143
|
-
| `
|
|
144
|
-
| `
|
|
145
|
-
| `
|
|
138
|
+
| `__standard_scaled` | StandardScaler (mean=0, std=1) | `income__standard_scaled` |
|
|
139
|
+
| `__minmax_scaled` | MinMaxScaler (range [0,1]) | `age__minmax_scaled` |
|
|
140
|
+
| `__robust_scaled` | RobustScaler (median-based, handles outliers) | `price__robust_scaled` |
|
|
141
|
+
| `__mean_imputed` | Fill missing values with mean | `salary__mean_imputed` |
|
|
142
|
+
| `__median_imputed` | Fill missing values with median | `age__median_imputed` |
|
|
143
|
+
| `__mode_imputed` | Fill missing values with mode | `category__mode_imputed` |
|
|
144
|
+
| `__onehot_encoded` | One-hot encoding | `state__onehot_encoded` |
|
|
145
|
+
| `__label_encoded` | Label encoding | `priority__label_encoded` |
|
|
146
146
|
|
|
147
|
-
> **Key Insight**: Transformations are read
|
|
147
|
+
> **Key Insight**: Transformations are read left-to-right. `income__mean_imputed__standard_scaled` means: take `income` → apply mean imputation → apply standard scaling.
|
|
148
148
|
|
|
149
149
|
**When You Need More Control**
|
|
150
150
|
|
|
151
151
|
Most of the time, simple string syntax is enough:
|
|
152
152
|
```python
|
|
153
153
|
# Example feature list (simple strings)
|
|
154
|
-
example_features = ["customer_id", "
|
|
154
|
+
example_features = ["customer_id", "income__standard_scaled", "region__onehot_encoded"]
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
But for advanced configurations, you can explicitly create `Feature` objects with custom options (covered in Section 3).
|
|
@@ -160,11 +160,11 @@ But for advanced configurations, you can explicitly create `Feature` objects wit
|
|
|
160
160
|
|
|
161
161
|
**Understanding the Feature Group Architecture**
|
|
162
162
|
|
|
163
|
-
Behind the scenes, chaining like `
|
|
163
|
+
Behind the scenes, chaining like `income__standard_scaled` creates feature group objects:
|
|
164
164
|
|
|
165
165
|
```python
|
|
166
166
|
# When you write this string:
|
|
167
|
-
"
|
|
167
|
+
"income__standard_scaled"
|
|
168
168
|
|
|
169
169
|
# mloda creates this chain of feature groups:
|
|
170
170
|
# StandardScalingFeatureGroup (reads from) → IncomeSourceFeatureGroup
|
|
@@ -236,7 +236,7 @@ mloda supports multiple data access patterns depending on your use case:
|
|
|
236
236
|
# )
|
|
237
237
|
#
|
|
238
238
|
# result = mlodaAPI.run_all(
|
|
239
|
-
# features=["customer_id", "
|
|
239
|
+
# features=["customer_id", "income__standard_scaled"],
|
|
240
240
|
# compute_frameworks={PandasDataframe},
|
|
241
241
|
# data_access_collection=data_access
|
|
242
242
|
# )
|
|
@@ -254,7 +254,7 @@ mloda supports multiple data access patterns depending on your use case:
|
|
|
254
254
|
# )
|
|
255
255
|
#
|
|
256
256
|
# result = mlodaAPI.run_all(
|
|
257
|
-
# features=["customer_id", "
|
|
257
|
+
# features=["customer_id", "age__standard_scaled"],
|
|
258
258
|
# compute_frameworks={PandasDataframe},
|
|
259
259
|
# api_input_data_collection=api_input_data_collection,
|
|
260
260
|
# api_data=api_data
|
|
@@ -273,7 +273,7 @@ mloda supports multiple compute frameworks (pandas, polars, pyarrow, etc.). Most
|
|
|
273
273
|
# Using the SampleData class from Section 1
|
|
274
274
|
# Default: Everything processes with pandas
|
|
275
275
|
result = mlodaAPI.run_all(
|
|
276
|
-
features=["customer_id", "
|
|
276
|
+
features=["customer_id", "income__standard_scaled"],
|
|
277
277
|
compute_frameworks={PandasDataframe} # Use pandas for all features
|
|
278
278
|
)
|
|
279
279
|
|
|
@@ -334,12 +334,12 @@ from sklearn.metrics import accuracy_score
|
|
|
334
334
|
result = mlodaAPI.run_all(
|
|
335
335
|
features=[
|
|
336
336
|
"customer_id",
|
|
337
|
-
"
|
|
338
|
-
"
|
|
339
|
-
"
|
|
340
|
-
"
|
|
341
|
-
"
|
|
342
|
-
"
|
|
337
|
+
"age__standard_scaled",
|
|
338
|
+
"income__standard_scaled",
|
|
339
|
+
"account_balance__robust_scaled",
|
|
340
|
+
"subscription_tier__label_encoded",
|
|
341
|
+
"region__label_encoded",
|
|
342
|
+
"customer_segment__label_encoded",
|
|
343
343
|
"churned"
|
|
344
344
|
],
|
|
345
345
|
compute_frameworks={PandasDataframe}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
mloda-0.
|
|
2
|
-
mloda-0.
|
|
1
|
+
mloda-0.3.0.dist-info/licenses/LICENSE.TXT,sha256=gmhQwSkHxjiShsqQ1FpJ-20YFtaa4vRCE7aCx55-6nk,11366
|
|
2
|
+
mloda-0.3.0.dist-info/licenses/NOTICE.md,sha256=Hu10B2sPnGLIHxZ4QhACSLLxukJpeJzjvkzCu48q5fY,520
|
|
3
3
|
mloda_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
mloda_core/abstract_plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
mloda_core/abstract_plugins/abstract_feature_group.py,sha256=I3fVEULHUtrvPoc94iyxyBQVacD7GGI5piqJ6FoqgAY,18435
|
|
@@ -17,12 +17,12 @@ mloda_core/abstract_plugins/components/feature_group_version.py,sha256=Syc1uXH50
|
|
|
17
17
|
mloda_core/abstract_plugins/components/feature_name.py,sha256=2NtKb_eespiOH9_j4y_pz5kBkmRbPUXq6U2giXlhkGg,778
|
|
18
18
|
mloda_core/abstract_plugins/components/feature_set.py,sha256=EeHep0iIvect21A6X-kNYBFUDgU8dkrfczTZwG_2FFY,4275
|
|
19
19
|
mloda_core/abstract_plugins/components/hashable_dict.py,sha256=xzUIn2wbujo3jwwGayHnSbrrADSiVYU_xUV1nt5Yk8M,426
|
|
20
|
-
mloda_core/abstract_plugins/components/link.py,sha256=
|
|
20
|
+
mloda_core/abstract_plugins/components/link.py,sha256=rmDutfOk-tLdUmWTLBmeXkEkCk4DhU-Lj0ov5GDnSxI,7445
|
|
21
21
|
mloda_core/abstract_plugins/components/options.py,sha256=k3fLwT4DpHN1Dmeht8mtXqjGhAsZnQI4sriKccC1mnE,11674
|
|
22
22
|
mloda_core/abstract_plugins/components/parallelization_modes.py,sha256=k7z5yvyQfhfNYcljfZ0dWBf0ZMpnCSqaW0vajCh202Q,144
|
|
23
23
|
mloda_core/abstract_plugins/components/utils.py,sha256=_ofeiOBQLwYU3_p9JBe61Ihps4dpFUcsrqI6XrA92Yo,530
|
|
24
24
|
mloda_core/abstract_plugins/components/feature_chainer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py,sha256=
|
|
25
|
+
mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py,sha256=dqwQOLJTOrEFmG-lIwGrKZnJ9rilEDDNAfC373dLJHQ,13289
|
|
26
26
|
mloda_core/abstract_plugins/components/framework_transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
27
|
mloda_core/abstract_plugins/components/framework_transformer/base_transformer.py,sha256=3eRSOzYZZ4OHRezvUnw4RLTUjirMGtcZCKQYJ1MuuZU,5793
|
|
28
28
|
mloda_core/abstract_plugins/components/framework_transformer/cfw_transformer.py,sha256=dODu95RTxAmLExId2XxPau-GZhBaGCO6k1sPntcwjfk,4298
|
|
@@ -114,7 +114,7 @@ mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_fra
|
|
|
114
114
|
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py,sha256=ueuL1i4B9OmCKYFBGHwXvlTOu_qD-mDdptMcx1VjH1s,8347
|
|
115
115
|
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py,sha256=S0yn42V95bN6Zxv2_JRRmX6NR_o7maEdzPluJrqpqD0,3438
|
|
116
116
|
mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py,sha256=w6Z6cFQhmy1sl4bH5R9KFVdJGq-B5_s0bfHuzmpifKM,5256
|
|
117
|
-
mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py,sha256=
|
|
117
|
+
mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py,sha256=yiUa66tV8ckTpaWZ-B3YS1_B63j_YIjM_xG-WAcuKIs,8279
|
|
118
118
|
mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py,sha256=syBOP6Ww9A_IfeJc49jpxByeP5PVvZTM9FFTUCZc3Xg,3452
|
|
119
119
|
mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py,sha256=CtIOllhGdYQisIiG0Ml0haG4sBC2UmrxKl8bhp4gzjY,3303
|
|
120
120
|
mloda_plugins/config/__init__.py,sha256=wm08JOS1kVronYOtmPJZCcEeMlA9wPOCFAIJG_Isi8c,34
|
|
@@ -127,29 +127,29 @@ mloda_plugins/feature_group/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
|
|
|
127
127
|
mloda_plugins/feature_group/experimental/default_options_key.py,sha256=GpSwOvR806wWZJ93DxC-Y3hnt4g7E4dELm8B5k6mZ0I,1040
|
|
128
128
|
mloda_plugins/feature_group/experimental/source_input_feature.py,sha256=SXnC8iB6WxSbj-w5qtnRHtxV4K9H4qsg3uMJd3zg3GA,11080
|
|
129
129
|
mloda_plugins/feature_group/experimental/aggregated_feature_group/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
130
|
-
mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py,sha256=
|
|
130
|
+
mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py,sha256=t16yx9bnU8FSQwg4mGoBZWDQ30eWc0TzRE6D5Bg1tc8,11423
|
|
131
131
|
mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py,sha256=7ntyidBNFVo-SmGTf1M5M0q-4dKonjQgmeqJ6XmwfYY,5014
|
|
132
132
|
mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py,sha256=ulsr6HDHHeaNSA63Fo4FIm5TwrBzFIfQgNrRWfWAX3I,6294
|
|
133
133
|
mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py,sha256=A7mgNeAwa5-afpnkNDIf3xbDxspqh198wjxkvcMBsF8,5738
|
|
134
134
|
mloda_plugins/feature_group/experimental/clustering/__init__.py,sha256=769NSapfi48V7BBh8zoo-ale2We6K4OV6ocNlzAhfEw,59
|
|
135
|
-
mloda_plugins/feature_group/experimental/clustering/base.py,sha256=
|
|
135
|
+
mloda_plugins/feature_group/experimental/clustering/base.py,sha256=ijJeAq2nqkc5TNzuz30kSgs4MsFcGvvUf0XbynC1-Bo,18569
|
|
136
136
|
mloda_plugins/feature_group/experimental/clustering/pandas.py,sha256=0k3gBw3ITzt9DMnOG2PCt4o0NzdOQy9-XM15M51Xqas,19327
|
|
137
137
|
mloda_plugins/feature_group/experimental/data_quality/__init__.py,sha256=ga8jdKaLl4bxkxMqNtRbrkHFnRWZIp8f3bR7DVG5d-I,45
|
|
138
138
|
mloda_plugins/feature_group/experimental/data_quality/missing_value/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
139
|
-
mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py,sha256
|
|
139
|
+
mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py,sha256=FIJnlIAq6U5PW1pa52W1mXHjPV0_YZB_vR4ml5xrLeM,15780
|
|
140
140
|
mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py,sha256=8l-uXJmxjlra8ADQisTQwla2abjT1UUplwuoyKIxp3k,8682
|
|
141
141
|
mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py,sha256=d13kWrXxdRddQ_6GbX5hKMNKpY9iRwhmVcx0CG5wafQ,14346
|
|
142
142
|
mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py,sha256=OrOd5MZdbnL4DCJFSZYuda5t2b5MvOBqdedgIPisV9g,13968
|
|
143
|
-
mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py,sha256=
|
|
144
|
-
mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py,sha256=
|
|
143
|
+
mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py,sha256=aIi6Cx09LxbmQH5geXlR78Cz5cTlMVWWpTbL85NJx34,17466
|
|
144
|
+
mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py,sha256=v47-g2gHQnLEjxo0txM9OGlG7nX6kkKrzRTGK0dRkqM,13279
|
|
145
145
|
mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
146
146
|
mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py,sha256=6EHBHpDKeg9lapzzMeRnvP392JKskhrxWQ_QZYIkH7Q,12850
|
|
147
147
|
mloda_plugins/feature_group/experimental/forecasting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
148
|
-
mloda_plugins/feature_group/experimental/forecasting/base.py,sha256=
|
|
148
|
+
mloda_plugins/feature_group/experimental/forecasting/base.py,sha256=8XSTivQwb-UbF62NjikOT_kMFm_ixHnUGVO1hjHe_uQ,24068
|
|
149
149
|
mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py,sha256=41HPYoJEXqTqcv6Zvce-vkL9RZ5YrdzSiJgmEFxGVR0,4289
|
|
150
150
|
mloda_plugins/feature_group/experimental/forecasting/pandas.py,sha256=Qus5jwAPs8bp546Y8e_piw6EoHkuru0Sl1UgdG0k_Yg,28913
|
|
151
151
|
mloda_plugins/feature_group/experimental/geo_distance/__init__.py,sha256=wqp7I3j87AmrVBi2rlqcz4Sj-R1QMe3EasmNFb_Zxg4,85
|
|
152
|
-
mloda_plugins/feature_group/experimental/geo_distance/base.py,sha256=
|
|
152
|
+
mloda_plugins/feature_group/experimental/geo_distance/base.py,sha256=Zz7DC4NbEc-oNqRir50bMNx7y8Bhq33WsKRUQmTDQP4,12801
|
|
153
153
|
mloda_plugins/feature_group/experimental/geo_distance/pandas.py,sha256=KwN_-sdpZobBiFev68ar0JWNXmupmAvh6f5L3CtbBAE,6023
|
|
154
154
|
mloda_plugins/feature_group/experimental/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
155
155
|
mloda_plugins/feature_group/experimental/llm/cli.py,sha256=65VO3deuQyNo2gQWRh6HuJXvzMtnYS6WIdaV-fqCFhc,1409
|
|
@@ -181,24 +181,24 @@ mloda_plugins/feature_group/experimental/llm/tools/available/replace_file_tool.p
|
|
|
181
181
|
mloda_plugins/feature_group/experimental/llm/tools/available/replace_file_tool_which_runs_tox.py,sha256=jTBpsIxF7mzZjeesd9ZeHUDwA17SkbLsL9brvl-YfOo,2119
|
|
182
182
|
mloda_plugins/feature_group/experimental/llm/tools/available/run_single_pytest.py,sha256=dLMb1iunH0EVY7YZ0NmlHC4kVhTOjs2Hjs2412dFTao,4114
|
|
183
183
|
mloda_plugins/feature_group/experimental/llm/tools/available/run_tox.py,sha256=2APL0MD_ExaMzsJK9_WfgDD9dmMY8amsgfc6B4Xgj70,3814
|
|
184
|
-
mloda_plugins/feature_group/experimental/node_centrality/base.py,sha256=
|
|
185
|
-
mloda_plugins/feature_group/experimental/node_centrality/pandas.py,sha256=
|
|
184
|
+
mloda_plugins/feature_group/experimental/node_centrality/base.py,sha256=bmWEA6qcdmwIz6Va3QYmwjban1YD16sKUiZn8n4Y49Y,14769
|
|
185
|
+
mloda_plugins/feature_group/experimental/node_centrality/pandas.py,sha256=pBvoe-rhAInIPeAKfxLZOrJzAkkauUuxKguhF6XXXws,20261
|
|
186
186
|
mloda_plugins/feature_group/experimental/sklearn/__init__.py,sha256=UubmqLyavXbzW40FeGY06XyORo-x1Uo0WCLcpmPWnAs,208
|
|
187
187
|
mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py,sha256=Sa5bIurlF-YZ0ybl1cPJWpLLOUTfaDa1DCffNcEvoVA,12777
|
|
188
188
|
mloda_plugins/feature_group/experimental/sklearn/encoding/__init__.py,sha256=WOe_iTVz2CXmVcL2IUNqhLJQqINFvY2rUktDXsNSOl8,153
|
|
189
|
-
mloda_plugins/feature_group/experimental/sklearn/encoding/base.py,sha256=
|
|
190
|
-
mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py,sha256=
|
|
189
|
+
mloda_plugins/feature_group/experimental/sklearn/encoding/base.py,sha256=ikl4PBWU3eUXc9Dxn8llmaEoAtKQ3MaIzRIITbo8IBw,19884
|
|
190
|
+
mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py,sha256=_U9gD-39wAFVl8tL1QexcJ2WZc7fu6qShuI1L0O1XBI,6001
|
|
191
191
|
mloda_plugins/feature_group/experimental/sklearn/pipeline/__init__.py,sha256=Z_xSZFAFItwRlbBVxbBxwW_S61tQ8r1N8Ih59jTUXqk,199
|
|
192
|
-
mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py,sha256=
|
|
192
|
+
mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py,sha256=VsWEp8dNdu3k4NSd6ckPtGBt3hDAnly7a2fzxiylvXM,23447
|
|
193
193
|
mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py,sha256=nKLRbqy2q5vFNhgEsHoBnwbaiJheV9bkgizDSYd_epE,4045
|
|
194
194
|
mloda_plugins/feature_group/experimental/sklearn/scaling/__init__.py,sha256=CsQEzK6DJ-WakWqsWTScHYsrBuOwLeX78zYV-NqxuDg,79
|
|
195
|
-
mloda_plugins/feature_group/experimental/sklearn/scaling/base.py,sha256
|
|
195
|
+
mloda_plugins/feature_group/experimental/sklearn/scaling/base.py,sha256=6CqOVyzKgTRdQCRjPT5RFfJTQ453MCO0GOoewpC7cuc,15409
|
|
196
196
|
mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py,sha256=8-DPSmUsEJVK4dlNh-041FI2YzmQ1Q7p6gWs0Zb7nKI,3960
|
|
197
|
-
mloda_plugins/feature_group/experimental/text_cleaning/base.py,sha256
|
|
197
|
+
mloda_plugins/feature_group/experimental/text_cleaning/base.py,sha256=-7nN7R7-wEkHoGYiry0UHtiL7W5_CKa-T1ktF0q7gUI,11313
|
|
198
198
|
mloda_plugins/feature_group/experimental/text_cleaning/pandas.py,sha256=7RbV8lMUzx5b8ph4IsXnab4v06IByrNOGte9oK7Zz0g,7339
|
|
199
199
|
mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py,sha256=9wRE1RioFRL-OtX467u4OEPvhDTzQAvdB-XAaJ1zDys,7829
|
|
200
200
|
mloda_plugins/feature_group/experimental/time_window/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
201
|
-
mloda_plugins/feature_group/experimental/time_window/base.py,sha256=
|
|
201
|
+
mloda_plugins/feature_group/experimental/time_window/base.py,sha256=TAqEFrnHQVzBtVQ4Y2L5yJ8f35SBo0j9_AFZJJ6bakk,18367
|
|
202
202
|
mloda_plugins/feature_group/experimental/time_window/pandas.py,sha256=YFjkO2Xu_vnB1XfQx2bElKRpUty0Ldic04hiYJKYfEo,7863
|
|
203
203
|
mloda_plugins/feature_group/experimental/time_window/pyarrow.py,sha256=SVwlfIt2qZVFp3InfLoszdSIBZh_EYFGzvIvRW9RVfA,10762
|
|
204
204
|
mloda_plugins/feature_group/input_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -222,8 +222,8 @@ mloda_plugins/function_extender/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
222
222
|
mloda_plugins/function_extender/base_implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
223
223
|
mloda_plugins/function_extender/base_implementations/otel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
224
224
|
mloda_plugins/function_extender/base_implementations/otel/otel_extender.py,sha256=M8GKb55ZGaoRaNCQOp69qr3w8jSMSD6D3VuGBpfw2t4,731
|
|
225
|
-
mloda-0.
|
|
226
|
-
mloda-0.
|
|
227
|
-
mloda-0.
|
|
228
|
-
mloda-0.
|
|
229
|
-
mloda-0.
|
|
225
|
+
mloda-0.3.0.dist-info/METADATA,sha256=gR1iP4xYXJNucYNPRsxqS8XRs9lv3Dl21indx8rESeQ,16643
|
|
226
|
+
mloda-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
227
|
+
mloda-0.3.0.dist-info/entry_points.txt,sha256=f7hp7s4laABj9eN5YwEjQAyInF-fa687MXdz-hKYMIA,80
|
|
228
|
+
mloda-0.3.0.dist-info/top_level.txt,sha256=KScNbTs4_vV-mJ1pIlP6cyvMl611B3hNxVYj2hA0Ex4,25
|
|
229
|
+
mloda-0.3.0.dist-info/RECORD,,
|
|
@@ -35,23 +35,24 @@ class FeatureChainParser:
|
|
|
35
35
|
"""Internal method for parsing feature names - used by match_configuration_feature_chain_parser."""
|
|
36
36
|
_feature_name: str = feature_name.name if isinstance(feature_name, FeatureName) else feature_name
|
|
37
37
|
|
|
38
|
-
parts = _feature_name.
|
|
39
|
-
|
|
38
|
+
parts = _feature_name.rsplit(pattern, 1)
|
|
39
|
+
source_feature = parts[0] if len(parts) > 1 else ""
|
|
40
|
+
operation_part = parts[1] if len(parts) > 1 else parts[0]
|
|
40
41
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
remainder = parts[1]
|
|
44
|
-
|
|
45
|
-
for prefix_pattern in prefix_patterns:
|
|
46
|
-
if re.match(prefix_pattern, itself) is None:
|
|
42
|
+
for suffix_pattern in prefix_patterns:
|
|
43
|
+
if re.match(suffix_pattern, _feature_name) is None:
|
|
47
44
|
continue
|
|
48
45
|
|
|
49
|
-
if len(parts) == 1:
|
|
46
|
+
if len(parts) == 1 or not source_feature:
|
|
50
47
|
raise ValueError(f"Matches the pattern {pattern}, but has no source feature: {_feature_name}")
|
|
51
48
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
49
|
+
match = re.match(suffix_pattern, _feature_name)
|
|
50
|
+
if match and match.groups():
|
|
51
|
+
operation_config = match.group(1)
|
|
52
|
+
else:
|
|
53
|
+
operation_config = operation_part.split("_")[0]
|
|
54
|
+
|
|
55
|
+
return operation_config, source_feature
|
|
55
56
|
|
|
56
57
|
return None, None
|
|
57
58
|
|
|
@@ -286,13 +287,13 @@ class FeatureChainParser:
|
|
|
286
287
|
return False
|
|
287
288
|
|
|
288
289
|
@classmethod
|
|
289
|
-
def extract_source_feature(cls, feature_name: str,
|
|
290
|
+
def extract_source_feature(cls, feature_name: str, suffix_pattern: str) -> str:
|
|
290
291
|
"""
|
|
291
|
-
Extract the source feature from a feature name based on the
|
|
292
|
+
Extract the source feature from a feature name based on the suffix pattern.
|
|
292
293
|
|
|
293
294
|
Args:
|
|
294
295
|
feature_name: The feature name to parse
|
|
295
|
-
|
|
296
|
+
suffix_pattern: Regex pattern for the suffix (e.g., r"^.+__([\w]+)$")
|
|
296
297
|
|
|
297
298
|
Returns:
|
|
298
299
|
The source feature part of the name
|
|
@@ -300,14 +301,14 @@ class FeatureChainParser:
|
|
|
300
301
|
Raises:
|
|
301
302
|
ValueError: If the feature name doesn't match the expected pattern
|
|
302
303
|
"""
|
|
303
|
-
match = re.match(
|
|
304
|
+
match = re.match(suffix_pattern, feature_name)
|
|
304
305
|
if not match:
|
|
305
306
|
raise ValueError(f"Invalid feature name format: {feature_name}")
|
|
306
307
|
|
|
307
|
-
#
|
|
308
|
-
|
|
309
|
-
if
|
|
308
|
+
# For L→R: source is everything BEFORE the last __
|
|
309
|
+
suffix_start = feature_name.rfind("__")
|
|
310
|
+
if suffix_start == -1:
|
|
310
311
|
raise ValueError(f"Invalid feature name format: {feature_name}. Missing double underscore separator.")
|
|
311
312
|
|
|
312
|
-
# Return everything
|
|
313
|
-
return feature_name[
|
|
313
|
+
# Return everything BEFORE the last double underscore (the source)
|
|
314
|
+
return feature_name[:suffix_start]
|
|
@@ -174,6 +174,8 @@ class SparkFramework(ComputeFrameWork):
|
|
|
174
174
|
self.set_framework_connection_object()
|
|
175
175
|
|
|
176
176
|
spark = self.framework_connection_object
|
|
177
|
+
if spark is None:
|
|
178
|
+
raise RuntimeError("Failed to initialize Spark session")
|
|
177
179
|
new_data_df = spark.createDataFrame(
|
|
178
180
|
[(i + 1, val) for i, val in enumerate(data_list)],
|
|
179
181
|
StructType(
|
|
@@ -40,15 +40,15 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
|
|
|
40
40
|
|
|
41
41
|
### 1. String-Based Creation
|
|
42
42
|
|
|
43
|
-
Features follow the naming pattern: `{
|
|
43
|
+
Features follow the naming pattern: `{mloda_source_features}__{aggregation_type}_aggr`
|
|
44
44
|
|
|
45
45
|
Examples:
|
|
46
46
|
```python
|
|
47
47
|
features = [
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
48
|
+
"sales__sum_aggr", # Sum of sales values
|
|
49
|
+
"temperature__avg_aggr", # Average temperature
|
|
50
|
+
"price__max_aggr", # Maximum price
|
|
51
|
+
"transactions__count_aggr" # Count of transactions
|
|
52
52
|
]
|
|
53
53
|
```
|
|
54
54
|
|
|
@@ -96,8 +96,8 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
|
|
|
96
96
|
"median": "Median value",
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
-
PATTERN = "
|
|
100
|
-
PREFIX_PATTERN = r"
|
|
99
|
+
PATTERN = "__"
|
|
100
|
+
PREFIX_PATTERN = r".*__([\w]+)_aggr$"
|
|
101
101
|
|
|
102
102
|
# Property mapping for configuration-based feature creation
|
|
103
103
|
PROPERTY_MAPPING = {
|
|
@@ -27,15 +27,15 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
27
27
|
## Feature Naming Convention
|
|
28
28
|
|
|
29
29
|
Clustering features follow this naming pattern:
|
|
30
|
-
`
|
|
30
|
+
`{mloda_source_features}__cluster_{algorithm}_{k_value}`
|
|
31
31
|
|
|
32
|
-
The source features
|
|
33
|
-
|
|
32
|
+
The source features come first, followed by the clustering operation.
|
|
33
|
+
Note the double underscore separating the source features from the operation.
|
|
34
34
|
|
|
35
35
|
Examples:
|
|
36
|
-
- `
|
|
37
|
-
- `
|
|
38
|
-
- `
|
|
36
|
+
- `customer_behavior__cluster_kmeans_5`: K-means clustering with 5 clusters on customer behavior data
|
|
37
|
+
- `transaction_patterns__cluster_hierarchical_3`: Hierarchical clustering with 3 clusters on transaction patterns
|
|
38
|
+
- `sensor_readings__cluster_dbscan_auto`: DBSCAN clustering with automatic cluster detection on sensor readings
|
|
39
39
|
|
|
40
40
|
## Configuration-Based Creation
|
|
41
41
|
|
|
@@ -57,7 +57,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
57
57
|
)
|
|
58
58
|
)
|
|
59
59
|
|
|
60
|
-
# The Engine will automatically parse this into a feature with name "
|
|
60
|
+
# The Engine will automatically parse this into a feature with name "customer_behavior__cluster_kmeans_5"
|
|
61
61
|
```
|
|
62
62
|
|
|
63
63
|
## Parameter Classification
|
|
@@ -102,7 +102,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
102
102
|
}
|
|
103
103
|
|
|
104
104
|
# Define the prefix pattern for this feature group
|
|
105
|
-
PREFIX_PATTERN = r"
|
|
105
|
+
PREFIX_PATTERN = r".*__cluster_([\w]+)_([\w]+)$"
|
|
106
106
|
PATTERN = "__"
|
|
107
107
|
|
|
108
108
|
# Property mapping for configuration-based feature creation
|
|
@@ -158,7 +158,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
158
158
|
@classmethod
|
|
159
159
|
def parse_clustering_prefix(cls, feature_name: str) -> tuple[str, str]:
|
|
160
160
|
"""
|
|
161
|
-
Parse the clustering
|
|
161
|
+
Parse the clustering suffix into its components.
|
|
162
162
|
|
|
163
163
|
Args:
|
|
164
164
|
feature_name: The feature name to parse
|
|
@@ -167,23 +167,23 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
167
167
|
A tuple containing (algorithm, k_value)
|
|
168
168
|
|
|
169
169
|
Raises:
|
|
170
|
-
ValueError: If the
|
|
170
|
+
ValueError: If the suffix doesn't match the expected pattern
|
|
171
171
|
"""
|
|
172
|
-
# Extract the
|
|
173
|
-
|
|
174
|
-
if
|
|
172
|
+
# Extract the suffix part (everything after the double underscore)
|
|
173
|
+
suffix_start = feature_name.find("__")
|
|
174
|
+
if suffix_start == -1:
|
|
175
175
|
raise ValueError(
|
|
176
176
|
f"Invalid clustering feature name format: {feature_name}. Missing double underscore separator."
|
|
177
177
|
)
|
|
178
178
|
|
|
179
|
-
|
|
179
|
+
suffix = feature_name[suffix_start + 2 :]
|
|
180
180
|
|
|
181
|
-
# Parse the
|
|
182
|
-
parts =
|
|
181
|
+
# Parse the suffix components
|
|
182
|
+
parts = suffix.split("_")
|
|
183
183
|
if len(parts) != 3 or parts[0] != "cluster":
|
|
184
184
|
raise ValueError(
|
|
185
185
|
f"Invalid clustering feature name format: {feature_name}. "
|
|
186
|
-
f"Expected format:
|
|
186
|
+
f"Expected format: {{mloda_source_features}}__cluster_{{algorithm}}_{{k_value}}"
|
|
187
187
|
)
|
|
188
188
|
|
|
189
189
|
algorithm, k_value = parts[1], parts[2]
|
|
@@ -37,14 +37,14 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
37
37
|
|
|
38
38
|
### 1. String-Based Creation
|
|
39
39
|
|
|
40
|
-
Features follow the naming pattern: `{
|
|
40
|
+
Features follow the naming pattern: `{mloda_source_features}__{imputation_method}_imputed`
|
|
41
41
|
|
|
42
42
|
Examples:
|
|
43
43
|
```python
|
|
44
44
|
features = [
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
45
|
+
"income__mean_imputed", # Impute missing values in income with the mean
|
|
46
|
+
"age__median_imputed", # Impute missing values in age with the median
|
|
47
|
+
"category__constant_imputed" # Impute missing values in category with a constant value
|
|
48
48
|
]
|
|
49
49
|
```
|
|
50
50
|
|
|
@@ -85,16 +85,16 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
85
85
|
from mloda_core.abstract_plugins.components.feature import Feature
|
|
86
86
|
|
|
87
87
|
# Impute missing income values with mean
|
|
88
|
-
feature = Feature(name="
|
|
88
|
+
feature = Feature(name="income__mean_imputed")
|
|
89
89
|
|
|
90
90
|
# Impute missing age values with median
|
|
91
|
-
feature = Feature(name="
|
|
91
|
+
feature = Feature(name="age__median_imputed")
|
|
92
92
|
|
|
93
93
|
# Impute missing category values with mode
|
|
94
|
-
feature = Feature(name="
|
|
94
|
+
feature = Feature(name="category__mode_imputed")
|
|
95
95
|
|
|
96
96
|
# Forward fill missing temperature values
|
|
97
|
-
feature = Feature(name="
|
|
97
|
+
feature = Feature(name="temperature__ffill_imputed")
|
|
98
98
|
```
|
|
99
99
|
|
|
100
100
|
### Configuration-Based Creation
|
|
@@ -158,7 +158,7 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
158
158
|
}
|
|
159
159
|
|
|
160
160
|
PATTERN = "__"
|
|
161
|
-
PREFIX_PATTERN = r"
|
|
161
|
+
PREFIX_PATTERN = r".*__([\w]+)_imputed$"
|
|
162
162
|
|
|
163
163
|
PROPERTY_MAPPING = {
|
|
164
164
|
IMPUTATION_METHOD: {
|
|
@@ -187,7 +187,10 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
187
187
|
source_feature: str | None = None
|
|
188
188
|
|
|
189
189
|
# Try string-based parsing first
|
|
190
|
-
|
|
190
|
+
# parse_feature_name returns (operation_config, source_feature)
|
|
191
|
+
operation_config, source_feature = FeatureChainParser.parse_feature_name(
|
|
192
|
+
feature_name, self.PATTERN, [self.PREFIX_PATTERN]
|
|
193
|
+
)
|
|
191
194
|
if source_feature is not None:
|
|
192
195
|
return {Feature(source_feature)}
|
|
193
196
|
|
|
@@ -202,11 +205,16 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
202
205
|
@classmethod
|
|
203
206
|
def get_imputation_method(cls, feature_name: str) -> str:
|
|
204
207
|
"""Extract the imputation method from the feature name."""
|
|
205
|
-
|
|
206
|
-
|
|
208
|
+
# parse_feature_name returns (operation_config, source_feature)
|
|
209
|
+
# The operation_config contains the imputation method extracted from the suffix pattern
|
|
210
|
+
operation_config, _ = FeatureChainParser.parse_feature_name(feature_name, cls.PATTERN, [cls.PREFIX_PATTERN])
|
|
211
|
+
if operation_config is None:
|
|
207
212
|
raise ValueError(f"Invalid missing value feature name format: {feature_name}")
|
|
208
213
|
|
|
209
|
-
|
|
214
|
+
# The PREFIX_PATTERN captures the method name (e.g., "mean" from "mean_imputed")
|
|
215
|
+
# So operation_config already contains just the method name
|
|
216
|
+
imputation_method = operation_config
|
|
217
|
+
|
|
210
218
|
# Validate imputation method
|
|
211
219
|
if imputation_method not in cls.IMPUTATION_METHODS:
|
|
212
220
|
raise ValueError(
|
|
@@ -257,7 +265,9 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
257
265
|
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
|
|
258
266
|
|
|
259
267
|
if cls.PATTERN in feature_name_str:
|
|
268
|
+
# Use get_imputation_method which already handles parse_feature_name correctly
|
|
260
269
|
imputation_method = cls.get_imputation_method(feature_name_str)
|
|
270
|
+
# Use extract_source_feature which returns everything before the last __
|
|
261
271
|
source_feature_name = FeatureChainParser.extract_source_feature(feature_name_str, cls.PREFIX_PATTERN)
|
|
262
272
|
return imputation_method, source_feature_name
|
|
263
273
|
|
|
@@ -271,7 +281,7 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
|
|
|
271
281
|
if imputation_method is None or source_feature_name is None:
|
|
272
282
|
raise ValueError(f"Could not extract imputation method and source feature from: {feature.name}")
|
|
273
283
|
|
|
274
|
-
|
|
284
|
+
# Validate imputation method (no need to strip "imputed" from config-based method)
|
|
275
285
|
if imputation_method not in cls.IMPUTATION_METHODS:
|
|
276
286
|
raise ValueError(
|
|
277
287
|
f"Unsupported imputation method: {imputation_method}. "
|