mloda 0.2.15__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/METADATA +32 -32
  2. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/RECORD +24 -24
  3. mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +22 -21
  4. mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +2 -0
  5. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +7 -7
  6. mloda_plugins/feature_group/experimental/clustering/base.py +17 -17
  7. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +24 -14
  8. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +17 -17
  9. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +1 -1
  10. mloda_plugins/feature_group/experimental/forecasting/base.py +19 -18
  11. mloda_plugins/feature_group/experimental/geo_distance/base.py +25 -15
  12. mloda_plugins/feature_group/experimental/node_centrality/base.py +29 -29
  13. mloda_plugins/feature_group/experimental/node_centrality/pandas.py +3 -2
  14. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +19 -19
  15. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
  16. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +12 -16
  17. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +9 -9
  18. mloda_plugins/feature_group/experimental/text_cleaning/base.py +7 -7
  19. mloda_plugins/feature_group/experimental/time_window/base.py +22 -20
  20. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/WHEEL +0 -0
  21. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/entry_points.txt +0 -0
  22. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/licenses/LICENSE.TXT +0 -0
  23. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/licenses/NOTICE.md +0 -0
  24. {mloda-0.2.15.dist-info → mloda-0.3.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mloda
3
- Version: 0.2.15
3
+ Version: 0.3.0
4
4
  Summary: Rethinking Data and Feature Engineering
5
5
  Author-email: Tom Kaltofen <info@mloda.ai>
6
6
  License: Apache-2.0
@@ -87,7 +87,7 @@ result = mlodaAPI.run_all(
87
87
  features=[
88
88
  "customer_id", # Original column
89
89
  "age", # Original column
90
- "standard_scaled__income" # Transform: scale income to mean=0, std=1
90
+ "income__standard_scaled" # Transform: scale income to mean=0, std=1
91
91
  ],
92
92
  compute_frameworks={PandasDataframe}
93
93
  )
@@ -104,54 +104,54 @@ print(data.head())
104
104
  3. **mlodaAPI.run_all()** - Executed the feature pipeline:
105
105
  - Got data from `SampleData`
106
106
  - Extracted `customer_id` and `age` as-is
107
- - Applied StandardScaler to `income` → `standard_scaled__income`
107
+ - Applied StandardScaler to `income` → `income__standard_scaled`
108
108
  4. **result[0]** - Retrieved the processed pandas DataFrame
109
109
 
110
- > **Key Insight**: The syntax `standard_scaled__income` is mloda's **feature chaining**. Behind the scenes, mloda creates a chain of **feature group** objects (`StandardScalingFeatureGroup` → `SourceFeatureGroup`), automatically resolving dependencies. See [Section 2](#2-understanding-feature-chaining-transformations) for full explanation of chaining syntax and [Section 4](#4-advanced-feature-objects-for-complex-configurations) to learn about the underlying feature group architecture.
110
+ > **Key Insight**: The syntax `income__standard_scaled` is mloda's **feature chaining**. Behind the scenes, mloda creates a chain of **feature group** objects (`SourceFeatureGroup` → `StandardScalingFeatureGroup`), automatically resolving dependencies. See [Section 2](#2-understanding-feature-chaining-transformations) for a full explanation of the chaining syntax and [Section 4](#4-advanced-feature-objects-for-complex-configurations) to learn about the underlying feature group architecture.
111
111
 
112
112
  ### 2. Understanding Feature Chaining (Transformations)
113
113
 
114
114
  **The Power of Double Underscore `__` Syntax**
115
115
 
116
- As mentioned in Section 1, feature chaining (like `standard_scaled__income`) is syntactic sugar that mloda converts into a chain of **feature group objects**. Each transformation (`standard_scaled`, `mean_imputed`, etc.) corresponds to a specific feature group class.
116
+ As mentioned in Section 1, feature chaining (like `income__standard_scaled`) is syntactic sugar that mloda converts into a chain of **feature group objects**. Each transformation (`standard_scaled`, `mean_imputed`, etc.) corresponds to a specific feature group class.
117
117
 
118
118
  mloda's chaining syntax lets you compose transformations using `__` as a separator:
119
119
 
120
120
  ```python
121
121
  # Pattern examples (these show the syntax):
122
- # "standard_scaled__income" # Scale income column
123
- # "mean_imputed__age" # Fill missing age values with mean
124
- # "onehot_encoded__category" # One-hot encode category column
122
+ # "income__standard_scaled" # Scale income column
123
+ # "age__mean_imputed" # Fill missing age values with mean
124
+ # "category__onehot_encoded" # One-hot encode category column
125
125
  #
126
126
  # You can chain transformations!
127
- # Pattern: {transform2}__{transform1}__{source}
128
- # "standard_scaled__mean_imputed__income" # First impute, then scale
127
+ # Pattern: {source}__{transform1}__{transform2}
128
+ # "income__mean_imputed__standard_scaled" # First impute, then scale
129
129
 
130
130
  # Real working example:
131
- _ = ["standard_scaled__income", "mean_imputed__age"] # Valid feature names
131
+ _ = ["income__standard_scaled", "age__mean_imputed"] # Valid feature names
132
132
  ```
133
133
 
134
134
  **Available Transformations:**
135
135
 
136
136
  | Transformation | Purpose | Example |
137
137
  |---------------|---------|---------|
138
- | `standard_scaled__` | StandardScaler (mean=0, std=1) | `standard_scaled__income` |
139
- | `minmax_scaled__` | MinMaxScaler (range [0,1]) | `minmax_scaled__age` |
140
- | `robust_scaled__` | RobustScaler (median-based, handles outliers) | `robust_scaled__price` |
141
- | `mean_imputed__` | Fill missing values with mean | `mean_imputed__salary` |
142
- | `median_imputed__` | Fill missing values with median | `median_imputed__age` |
143
- | `mode_imputed__` | Fill missing values with mode | `mode_imputed__category` |
144
- | `onehot_encoded__` | One-hot encoding | `onehot_encoded__state` |
145
- | `label_encoded__` | Label encoding | `label_encoded__priority` |
138
+ | `__standard_scaled` | StandardScaler (mean=0, std=1) | `income__standard_scaled` |
139
+ | `__minmax_scaled` | MinMaxScaler (range [0,1]) | `age__minmax_scaled` |
140
+ | `__robust_scaled` | RobustScaler (median-based, handles outliers) | `price__robust_scaled` |
141
+ | `__mean_imputed` | Fill missing values with mean | `salary__mean_imputed` |
142
+ | `__median_imputed` | Fill missing values with median | `age__median_imputed` |
143
+ | `__mode_imputed` | Fill missing values with mode | `category__mode_imputed` |
144
+ | `__onehot_encoded` | One-hot encoding | `state__onehot_encoded` |
145
+ | `__label_encoded` | Label encoding | `priority__label_encoded` |
146
146
 
147
- > **Key Insight**: Transformations are read right-to-left. `standard_scaled__mean_imputed__income` means: take `income` → apply mean imputation → apply standard scaling.
147
+ > **Key Insight**: Transformations are read left-to-right. `income__mean_imputed__standard_scaled` means: take `income` → apply mean imputation → apply standard scaling.
148
148
 
149
149
  **When You Need More Control**
150
150
 
151
151
  Most of the time, simple string syntax is enough:
152
152
  ```python
153
153
  # Example feature list (simple strings)
154
- example_features = ["customer_id", "standard_scaled__income", "onehot_encoded__region"]
154
+ example_features = ["customer_id", "income__standard_scaled", "region__onehot_encoded"]
155
155
  ```
156
156
 
157
157
  But for advanced configurations, you can explicitly create `Feature` objects with custom options (covered in Section 3).
@@ -160,11 +160,11 @@ But for advanced configurations, you can explicitly create `Feature` objects wit
160
160
 
161
161
  **Understanding the Feature Group Architecture**
162
162
 
163
- Behind the scenes, chaining like `standard_scaled__income` creates feature group objects:
163
+ Behind the scenes, chaining like `income__standard_scaled` creates feature group objects:
164
164
 
165
165
  ```python
166
166
  # When you write this string:
167
- "standard_scaled__income"
167
+ "income__standard_scaled"
168
168
 
169
169
  # mloda creates this chain of feature groups:
170
170
  # StandardScalingFeatureGroup (reads from) → IncomeSourceFeatureGroup
@@ -236,7 +236,7 @@ mloda supports multiple data access patterns depending on your use case:
236
236
  # )
237
237
  #
238
238
  # result = mlodaAPI.run_all(
239
- # features=["customer_id", "standard_scaled__income"],
239
+ # features=["customer_id", "income__standard_scaled"],
240
240
  # compute_frameworks={PandasDataframe},
241
241
  # data_access_collection=data_access
242
242
  # )
@@ -254,7 +254,7 @@ mloda supports multiple data access patterns depending on your use case:
254
254
  # )
255
255
  #
256
256
  # result = mlodaAPI.run_all(
257
- # features=["customer_id", "standard_scaled__age"],
257
+ # features=["customer_id", "age__standard_scaled"],
258
258
  # compute_frameworks={PandasDataframe},
259
259
  # api_input_data_collection=api_input_data_collection,
260
260
  # api_data=api_data
@@ -273,7 +273,7 @@ mloda supports multiple compute frameworks (pandas, polars, pyarrow, etc.). Most
273
273
  # Using the SampleData class from Section 1
274
274
  # Default: Everything processes with pandas
275
275
  result = mlodaAPI.run_all(
276
- features=["customer_id", "standard_scaled__income"],
276
+ features=["customer_id", "income__standard_scaled"],
277
277
  compute_frameworks={PandasDataframe} # Use pandas for all features
278
278
  )
279
279
 
@@ -334,12 +334,12 @@ from sklearn.metrics import accuracy_score
334
334
  result = mlodaAPI.run_all(
335
335
  features=[
336
336
  "customer_id",
337
- "standard_scaled__age",
338
- "standard_scaled__income",
339
- "robust_scaled__account_balance",
340
- "label_encoded__subscription_tier",
341
- "label_encoded__region",
342
- "label_encoded__customer_segment",
337
+ "age__standard_scaled",
338
+ "income__standard_scaled",
339
+ "account_balance__robust_scaled",
340
+ "subscription_tier__label_encoded",
341
+ "region__label_encoded",
342
+ "customer_segment__label_encoded",
343
343
  "churned"
344
344
  ],
345
345
  compute_frameworks={PandasDataframe}
@@ -1,5 +1,5 @@
1
- mloda-0.2.15.dist-info/licenses/LICENSE.TXT,sha256=gmhQwSkHxjiShsqQ1FpJ-20YFtaa4vRCE7aCx55-6nk,11366
2
- mloda-0.2.15.dist-info/licenses/NOTICE.md,sha256=Hu10B2sPnGLIHxZ4QhACSLLxukJpeJzjvkzCu48q5fY,520
1
+ mloda-0.3.0.dist-info/licenses/LICENSE.TXT,sha256=gmhQwSkHxjiShsqQ1FpJ-20YFtaa4vRCE7aCx55-6nk,11366
2
+ mloda-0.3.0.dist-info/licenses/NOTICE.md,sha256=Hu10B2sPnGLIHxZ4QhACSLLxukJpeJzjvkzCu48q5fY,520
3
3
  mloda_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  mloda_core/abstract_plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  mloda_core/abstract_plugins/abstract_feature_group.py,sha256=I3fVEULHUtrvPoc94iyxyBQVacD7GGI5piqJ6FoqgAY,18435
@@ -22,7 +22,7 @@ mloda_core/abstract_plugins/components/options.py,sha256=k3fLwT4DpHN1Dmeht8mtXqj
22
22
  mloda_core/abstract_plugins/components/parallelization_modes.py,sha256=k7z5yvyQfhfNYcljfZ0dWBf0ZMpnCSqaW0vajCh202Q,144
23
23
  mloda_core/abstract_plugins/components/utils.py,sha256=_ofeiOBQLwYU3_p9JBe61Ihps4dpFUcsrqI6XrA92Yo,530
24
24
  mloda_core/abstract_plugins/components/feature_chainer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
- mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py,sha256=xmMIQProp2y5kM6b7IGnkpwaXm2Yq4p7D_FtYX9sCsE,13180
25
+ mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py,sha256=dqwQOLJTOrEFmG-lIwGrKZnJ9rilEDDNAfC373dLJHQ,13289
26
26
  mloda_core/abstract_plugins/components/framework_transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
27
  mloda_core/abstract_plugins/components/framework_transformer/base_transformer.py,sha256=3eRSOzYZZ4OHRezvUnw4RLTUjirMGtcZCKQYJ1MuuZU,5793
28
28
  mloda_core/abstract_plugins/components/framework_transformer/cfw_transformer.py,sha256=dODu95RTxAmLExId2XxPau-GZhBaGCO6k1sPntcwjfk,4298
@@ -114,7 +114,7 @@ mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_fra
114
114
  mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py,sha256=ueuL1i4B9OmCKYFBGHwXvlTOu_qD-mDdptMcx1VjH1s,8347
115
115
  mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py,sha256=S0yn42V95bN6Zxv2_JRRmX6NR_o7maEdzPluJrqpqD0,3438
116
116
  mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py,sha256=w6Z6cFQhmy1sl4bH5R9KFVdJGq-B5_s0bfHuzmpifKM,5256
117
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py,sha256=Jf57IEHKPXwlpc3A8jnoka8T-JVSFPIny_wxWKo86zw,8168
117
+ mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py,sha256=yiUa66tV8ckTpaWZ-B3YS1_B63j_YIjM_xG-WAcuKIs,8279
118
118
  mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py,sha256=syBOP6Ww9A_IfeJc49jpxByeP5PVvZTM9FFTUCZc3Xg,3452
119
119
  mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py,sha256=CtIOllhGdYQisIiG0Ml0haG4sBC2UmrxKl8bhp4gzjY,3303
120
120
  mloda_plugins/config/__init__.py,sha256=wm08JOS1kVronYOtmPJZCcEeMlA9wPOCFAIJG_Isi8c,34
@@ -127,29 +127,29 @@ mloda_plugins/feature_group/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5
127
127
  mloda_plugins/feature_group/experimental/default_options_key.py,sha256=GpSwOvR806wWZJ93DxC-Y3hnt4g7E4dELm8B5k6mZ0I,1040
128
128
  mloda_plugins/feature_group/experimental/source_input_feature.py,sha256=SXnC8iB6WxSbj-w5qtnRHtxV4K9H4qsg3uMJd3zg3GA,11080
129
129
  mloda_plugins/feature_group/experimental/aggregated_feature_group/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
130
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py,sha256=VpYgJnAQP3jmQ7YXS6239TLExu0nA681-fZoD2CjrqQ,11426
130
+ mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py,sha256=t16yx9bnU8FSQwg4mGoBZWDQ30eWc0TzRE6D5Bg1tc8,11423
131
131
  mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py,sha256=7ntyidBNFVo-SmGTf1M5M0q-4dKonjQgmeqJ6XmwfYY,5014
132
132
  mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py,sha256=ulsr6HDHHeaNSA63Fo4FIm5TwrBzFIfQgNrRWfWAX3I,6294
133
133
  mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py,sha256=A7mgNeAwa5-afpnkNDIf3xbDxspqh198wjxkvcMBsF8,5738
134
134
  mloda_plugins/feature_group/experimental/clustering/__init__.py,sha256=769NSapfi48V7BBh8zoo-ale2We6K4OV6ocNlzAhfEw,59
135
- mloda_plugins/feature_group/experimental/clustering/base.py,sha256=7NFibbkFu4Wv5FMc3OFsMvSVqWDKFdoxx8YNk2XiJG0,18592
135
+ mloda_plugins/feature_group/experimental/clustering/base.py,sha256=ijJeAq2nqkc5TNzuz30kSgs4MsFcGvvUf0XbynC1-Bo,18569
136
136
  mloda_plugins/feature_group/experimental/clustering/pandas.py,sha256=0k3gBw3ITzt9DMnOG2PCt4o0NzdOQy9-XM15M51Xqas,19327
137
137
  mloda_plugins/feature_group/experimental/data_quality/__init__.py,sha256=ga8jdKaLl4bxkxMqNtRbrkHFnRWZIp8f3bR7DVG5d-I,45
138
138
  mloda_plugins/feature_group/experimental/data_quality/missing_value/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py,sha256=-fu4ziP1dCtjy0iHwfxNvKYhinaLBK8v29Qr2xc_zoQ,15193
139
+ mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py,sha256=FIJnlIAq6U5PW1pa52W1mXHjPV0_YZB_vR4ml5xrLeM,15780
140
140
  mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py,sha256=8l-uXJmxjlra8ADQisTQwla2abjT1UUplwuoyKIxp3k,8682
141
141
  mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py,sha256=d13kWrXxdRddQ_6GbX5hKMNKpY9iRwhmVcx0CG5wafQ,14346
142
142
  mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py,sha256=OrOd5MZdbnL4DCJFSZYuda5t2b5MvOBqdedgIPisV9g,13968
143
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py,sha256=u3DZMrdz_aopkdrygSkKplJ4y_Jj5Hwww4ot36wJFP4,17431
144
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py,sha256=50M72lvFkU4q7QqW8trS26f7NamJnucdrI5fdfnw8uE,13279
143
+ mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py,sha256=aIi6Cx09LxbmQH5geXlR78Cz5cTlMVWWpTbL85NJx34,17466
144
+ mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py,sha256=v47-g2gHQnLEjxo0txM9OGlG7nX6kkKrzRTGK0dRkqM,13279
145
145
  mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py,sha256=6EHBHpDKeg9lapzzMeRnvP392JKskhrxWQ_QZYIkH7Q,12850
147
147
  mloda_plugins/feature_group/experimental/forecasting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
148
- mloda_plugins/feature_group/experimental/forecasting/base.py,sha256=7Vzl2tl1Jz057GIYk-IaYJTMPBU21IyLxtGnTrXAecI,24033
148
+ mloda_plugins/feature_group/experimental/forecasting/base.py,sha256=8XSTivQwb-UbF62NjikOT_kMFm_ixHnUGVO1hjHe_uQ,24068
149
149
  mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py,sha256=41HPYoJEXqTqcv6Zvce-vkL9RZ5YrdzSiJgmEFxGVR0,4289
150
150
  mloda_plugins/feature_group/experimental/forecasting/pandas.py,sha256=Qus5jwAPs8bp546Y8e_piw6EoHkuru0Sl1UgdG0k_Yg,28913
151
151
  mloda_plugins/feature_group/experimental/geo_distance/__init__.py,sha256=wqp7I3j87AmrVBi2rlqcz4Sj-R1QMe3EasmNFb_Zxg4,85
152
- mloda_plugins/feature_group/experimental/geo_distance/base.py,sha256=CHYbzIBPypKs22-DKlk_PDXf7-obKr9acGY7CXIyxaE,12259
152
+ mloda_plugins/feature_group/experimental/geo_distance/base.py,sha256=Zz7DC4NbEc-oNqRir50bMNx7y8Bhq33WsKRUQmTDQP4,12801
153
153
  mloda_plugins/feature_group/experimental/geo_distance/pandas.py,sha256=KwN_-sdpZobBiFev68ar0JWNXmupmAvh6f5L3CtbBAE,6023
154
154
  mloda_plugins/feature_group/experimental/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
155
155
  mloda_plugins/feature_group/experimental/llm/cli.py,sha256=65VO3deuQyNo2gQWRh6HuJXvzMtnYS6WIdaV-fqCFhc,1409
@@ -181,24 +181,24 @@ mloda_plugins/feature_group/experimental/llm/tools/available/replace_file_tool.p
181
181
  mloda_plugins/feature_group/experimental/llm/tools/available/replace_file_tool_which_runs_tox.py,sha256=jTBpsIxF7mzZjeesd9ZeHUDwA17SkbLsL9brvl-YfOo,2119
182
182
  mloda_plugins/feature_group/experimental/llm/tools/available/run_single_pytest.py,sha256=dLMb1iunH0EVY7YZ0NmlHC4kVhTOjs2Hjs2412dFTao,4114
183
183
  mloda_plugins/feature_group/experimental/llm/tools/available/run_tox.py,sha256=2APL0MD_ExaMzsJK9_WfgDD9dmMY8amsgfc6B4Xgj70,3814
184
- mloda_plugins/feature_group/experimental/node_centrality/base.py,sha256=2wU4PHrG429B3erVjnpoi9r5uvXisV71gOvXC8phOls,14600
185
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py,sha256=PI2fjKutagb34WNGPP8yU8lIFU4XR3pLkQ9wFRddkbo,20164
184
+ mloda_plugins/feature_group/experimental/node_centrality/base.py,sha256=bmWEA6qcdmwIz6Va3QYmwjban1YD16sKUiZn8n4Y49Y,14769
185
+ mloda_plugins/feature_group/experimental/node_centrality/pandas.py,sha256=pBvoe-rhAInIPeAKfxLZOrJzAkkauUuxKguhF6XXXws,20261
186
186
  mloda_plugins/feature_group/experimental/sklearn/__init__.py,sha256=UubmqLyavXbzW40FeGY06XyORo-x1Uo0WCLcpmPWnAs,208
187
187
  mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py,sha256=Sa5bIurlF-YZ0ybl1cPJWpLLOUTfaDa1DCffNcEvoVA,12777
188
188
  mloda_plugins/feature_group/experimental/sklearn/encoding/__init__.py,sha256=WOe_iTVz2CXmVcL2IUNqhLJQqINFvY2rUktDXsNSOl8,153
189
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py,sha256=P88oOfsdXrxZnoAmjMbKhD1ij_RcEWMxywDkdouTgpk,19875
190
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py,sha256=2kYTEOz_HygPUQSCnprVPHtKLRJg9nhuZim87tpfyJk,6001
189
+ mloda_plugins/feature_group/experimental/sklearn/encoding/base.py,sha256=ikl4PBWU3eUXc9Dxn8llmaEoAtKQ3MaIzRIITbo8IBw,19884
190
+ mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py,sha256=_U9gD-39wAFVl8tL1QexcJ2WZc7fu6qShuI1L0O1XBI,6001
191
191
  mloda_plugins/feature_group/experimental/sklearn/pipeline/__init__.py,sha256=Z_xSZFAFItwRlbBVxbBxwW_S61tQ8r1N8Ih59jTUXqk,199
192
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py,sha256=2Wd4F0fbxCU1KdvwtHpP8M2ir32x3gQI0jFvT78b22U,23646
192
+ mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py,sha256=VsWEp8dNdu3k4NSd6ckPtGBt3hDAnly7a2fzxiylvXM,23447
193
193
  mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py,sha256=nKLRbqy2q5vFNhgEsHoBnwbaiJheV9bkgizDSYd_epE,4045
194
194
  mloda_plugins/feature_group/experimental/sklearn/scaling/__init__.py,sha256=CsQEzK6DJ-WakWqsWTScHYsrBuOwLeX78zYV-NqxuDg,79
195
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py,sha256=-rFud7Pu1vrylaF-lflOSG9p7zskppX4GA686dG9-Nk,15409
195
+ mloda_plugins/feature_group/experimental/sklearn/scaling/base.py,sha256=6CqOVyzKgTRdQCRjPT5RFfJTQ453MCO0GOoewpC7cuc,15409
196
196
  mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py,sha256=8-DPSmUsEJVK4dlNh-041FI2YzmQ1Q7p6gWs0Zb7nKI,3960
197
- mloda_plugins/feature_group/experimental/text_cleaning/base.py,sha256=N36x2njBTTuqCqC1PSB5VFSuG1PfwkrWBJ06XvNNUHc,11350
197
+ mloda_plugins/feature_group/experimental/text_cleaning/base.py,sha256=-7nN7R7-wEkHoGYiry0UHtiL7W5_CKa-T1ktF0q7gUI,11313
198
198
  mloda_plugins/feature_group/experimental/text_cleaning/pandas.py,sha256=7RbV8lMUzx5b8ph4IsXnab4v06IByrNOGte9oK7Zz0g,7339
199
199
  mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py,sha256=9wRE1RioFRL-OtX467u4OEPvhDTzQAvdB-XAaJ1zDys,7829
200
200
  mloda_plugins/feature_group/experimental/time_window/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
- mloda_plugins/feature_group/experimental/time_window/base.py,sha256=KSA6z1OTY4Zfwgylgpxa7MHh6HLWCgN1Q1l2-TnaQuY,18217
201
+ mloda_plugins/feature_group/experimental/time_window/base.py,sha256=TAqEFrnHQVzBtVQ4Y2L5yJ8f35SBo0j9_AFZJJ6bakk,18367
202
202
  mloda_plugins/feature_group/experimental/time_window/pandas.py,sha256=YFjkO2Xu_vnB1XfQx2bElKRpUty0Ldic04hiYJKYfEo,7863
203
203
  mloda_plugins/feature_group/experimental/time_window/pyarrow.py,sha256=SVwlfIt2qZVFp3InfLoszdSIBZh_EYFGzvIvRW9RVfA,10762
204
204
  mloda_plugins/feature_group/input_data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -222,8 +222,8 @@ mloda_plugins/function_extender/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
222
222
  mloda_plugins/function_extender/base_implementations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
223
  mloda_plugins/function_extender/base_implementations/otel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
224
224
  mloda_plugins/function_extender/base_implementations/otel/otel_extender.py,sha256=M8GKb55ZGaoRaNCQOp69qr3w8jSMSD6D3VuGBpfw2t4,731
225
- mloda-0.2.15.dist-info/METADATA,sha256=JcCj2VopjqSjyA33sQBtDSJeiUjOJyqed7pnRdR1LpA,16644
226
- mloda-0.2.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
227
- mloda-0.2.15.dist-info/entry_points.txt,sha256=f7hp7s4laABj9eN5YwEjQAyInF-fa687MXdz-hKYMIA,80
228
- mloda-0.2.15.dist-info/top_level.txt,sha256=KScNbTs4_vV-mJ1pIlP6cyvMl611B3hNxVYj2hA0Ex4,25
229
- mloda-0.2.15.dist-info/RECORD,,
225
+ mloda-0.3.0.dist-info/METADATA,sha256=gR1iP4xYXJNucYNPRsxqS8XRs9lv3Dl21indx8rESeQ,16643
226
+ mloda-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
227
+ mloda-0.3.0.dist-info/entry_points.txt,sha256=f7hp7s4laABj9eN5YwEjQAyInF-fa687MXdz-hKYMIA,80
228
+ mloda-0.3.0.dist-info/top_level.txt,sha256=KScNbTs4_vV-mJ1pIlP6cyvMl611B3hNxVYj2hA0Ex4,25
229
+ mloda-0.3.0.dist-info/RECORD,,
@@ -35,23 +35,24 @@ class FeatureChainParser:
35
35
  """Internal method for parsing feature names - used by match_configuration_feature_chain_parser."""
36
36
  _feature_name: str = feature_name.name if isinstance(feature_name, FeatureName) else feature_name
37
37
 
38
- parts = _feature_name.split("__", 1)
39
- itself = parts[0] + "__" # Ensure we have the prefix part with double underscore
38
+ parts = _feature_name.rsplit(pattern, 1)
39
+ source_feature = parts[0] if len(parts) > 1 else ""
40
+ operation_part = parts[1] if len(parts) > 1 else parts[0]
40
41
 
41
- remainder = ""
42
- if len(parts) > 1:
43
- remainder = parts[1]
44
-
45
- for prefix_pattern in prefix_patterns:
46
- if re.match(prefix_pattern, itself) is None:
42
+ for suffix_pattern in prefix_patterns:
43
+ if re.match(suffix_pattern, _feature_name) is None:
47
44
  continue
48
45
 
49
- if len(parts) == 1:
46
+ if len(parts) == 1 or not source_feature:
50
47
  raise ValueError(f"Matches the pattern {pattern}, but has no source feature: {_feature_name}")
51
48
 
52
- source_feature = remainder
53
- has_prefix_configuration = itself.split(pattern, 1)[0]
54
- return has_prefix_configuration, source_feature
49
+ match = re.match(suffix_pattern, _feature_name)
50
+ if match and match.groups():
51
+ operation_config = match.group(1)
52
+ else:
53
+ operation_config = operation_part.split("_")[0]
54
+
55
+ return operation_config, source_feature
55
56
 
56
57
  return None, None
57
58
 
@@ -286,13 +287,13 @@ class FeatureChainParser:
286
287
  return False
287
288
 
288
289
  @classmethod
289
- def extract_source_feature(cls, feature_name: str, prefix_pattern: str) -> str:
290
+ def extract_source_feature(cls, feature_name: str, suffix_pattern: str) -> str:
290
291
  """
291
- Extract the source feature from a feature name based on the prefix pattern.
292
+ Extract the source feature from a feature name based on the suffix pattern.
292
293
 
293
294
  Args:
294
295
  feature_name: The feature name to parse
295
- prefix_pattern: Regex pattern for the prefix (e.g., r"^([w]+)_aggr__")
296
+ suffix_pattern: Regex pattern for the suffix (e.g., r"^.+__([\w]+)$")
296
297
 
297
298
  Returns:
298
299
  The source feature part of the name
@@ -300,14 +301,14 @@ class FeatureChainParser:
300
301
  Raises:
301
302
  ValueError: If the feature name doesn't match the expected pattern
302
303
  """
303
- match = re.match(prefix_pattern, feature_name)
304
+ match = re.match(suffix_pattern, feature_name)
304
305
  if not match:
305
306
  raise ValueError(f"Invalid feature name format: {feature_name}")
306
307
 
307
- # Extract the prefix part (everything before the double underscore)
308
- prefix_end = feature_name.find("__")
309
- if prefix_end == -1:
308
+ # Left-to-right chaining: the source is everything BEFORE the last "__"
309
+ suffix_start = feature_name.rfind("__")
310
+ if suffix_start == -1:
310
311
  raise ValueError(f"Invalid feature name format: {feature_name}. Missing double underscore separator.")
311
312
 
312
- # Return everything after the double underscore
313
- return feature_name[prefix_end + 2 :]
313
+ # Return everything BEFORE the last double underscore (the source)
314
+ return feature_name[:suffix_start]
@@ -174,6 +174,8 @@ class SparkFramework(ComputeFrameWork):
174
174
  self.set_framework_connection_object()
175
175
 
176
176
  spark = self.framework_connection_object
177
+ if spark is None:
178
+ raise RuntimeError("Failed to initialize Spark session")
177
179
  new_data_df = spark.createDataFrame(
178
180
  [(i + 1, val) for i, val in enumerate(data_list)],
179
181
  StructType(
@@ -40,15 +40,15 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
40
40
 
41
41
  ### 1. String-Based Creation
42
42
 
43
- Features follow the naming pattern: `{aggregation_type}_aggr__{mloda_source_features}`
43
+ Features follow the naming pattern: `{mloda_source_features}__{aggregation_type}_aggr`
44
44
 
45
45
  Examples:
46
46
  ```python
47
47
  features = [
48
- "sum_aggr__sales", # Sum of sales values
49
- "avg_aggr__temperature", # Average temperature
50
- "max_aggr__price", # Maximum price
51
- "count_aggr__transactions" # Count of transactions
48
+ "sales__sum_aggr", # Sum of sales values
49
+ "temperature__avg_aggr", # Average temperature
50
+ "price__max_aggr", # Maximum price
51
+ "transactions__count_aggr" # Count of transactions
52
52
  ]
53
53
  ```
54
54
 
@@ -96,8 +96,8 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
96
96
  "median": "Median value",
97
97
  }
98
98
 
99
- PATTERN = "_aggr__"
100
- PREFIX_PATTERN = r"^([\w]+)_aggr__"
99
+ PATTERN = "__"
100
+ PREFIX_PATTERN = r".*__([\w]+)_aggr$"
101
101
 
102
102
  # Property mapping for configuration-based feature creation
103
103
  PROPERTY_MAPPING = {
@@ -27,15 +27,15 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
27
27
  ## Feature Naming Convention
28
28
 
29
29
  Clustering features follow this naming pattern:
30
- `cluster_{algorithm}_{k_value}__{mloda_source_features}`
30
+ `{mloda_source_features}__cluster_{algorithm}_{k_value}`
31
31
 
32
- The source features (mloda_source_features) are extracted from the feature name and used
33
- as input for the clustering algorithm. Note the double underscore before the source features.
32
+ The source features come first, followed by the clustering operation.
33
+ Note the double underscore separating the source features from the operation.
34
34
 
35
35
  Examples:
36
- - `cluster_kmeans_5__customer_behavior`: K-means clustering with 5 clusters on customer behavior data
37
- - `cluster_hierarchical_3__transaction_patterns`: Hierarchical clustering with 3 clusters on transaction patterns
38
- - `cluster_dbscan_auto__sensor_readings`: DBSCAN clustering with automatic cluster detection on sensor readings
36
+ - `customer_behavior__cluster_kmeans_5`: K-means clustering with 5 clusters on customer behavior data
37
+ - `transaction_patterns__cluster_hierarchical_3`: Hierarchical clustering with 3 clusters on transaction patterns
38
+ - `sensor_readings__cluster_dbscan_auto`: DBSCAN clustering with automatic cluster detection on sensor readings
39
39
 
40
40
  ## Configuration-Based Creation
41
41
 
@@ -57,7 +57,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
57
57
  )
58
58
  )
59
59
 
60
- # The Engine will automatically parse this into a feature with name "cluster_kmeans_5__customer_behavior"
60
+ # The Engine will automatically parse this into a feature with name "customer_behavior__cluster_kmeans_5"
61
61
  ```
62
62
 
63
63
  ## Parameter Classification
@@ -102,7 +102,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
102
102
  }
103
103
 
104
104
  # Define the prefix pattern for this feature group
105
- PREFIX_PATTERN = r"^cluster_([\w]+)_([\w]+)__"
105
+ PREFIX_PATTERN = r".*__cluster_([\w]+)_([\w]+)$"
106
106
  PATTERN = "__"
107
107
 
108
108
  # Property mapping for configuration-based feature creation
@@ -158,7 +158,7 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
158
158
  @classmethod
159
159
  def parse_clustering_prefix(cls, feature_name: str) -> tuple[str, str]:
160
160
  """
161
- Parse the clustering prefix into its components.
161
+ Parse the clustering suffix into its components.
162
162
 
163
163
  Args:
164
164
  feature_name: The feature name to parse
@@ -167,23 +167,23 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
167
167
  A tuple containing (algorithm, k_value)
168
168
 
169
169
  Raises:
170
- ValueError: If the prefix doesn't match the expected pattern
170
+ ValueError: If the suffix doesn't match the expected pattern
171
171
  """
172
- # Extract the prefix part (everything before the double underscore)
173
- prefix_end = feature_name.find("__")
174
- if prefix_end == -1:
172
+ # Extract the suffix part (everything after the first double underscore, as located by str.find)
173
+ suffix_start = feature_name.find("__")
174
+ if suffix_start == -1:
175
175
  raise ValueError(
176
176
  f"Invalid clustering feature name format: {feature_name}. Missing double underscore separator."
177
177
  )
178
178
 
179
- prefix = feature_name[:prefix_end]
179
+ suffix = feature_name[suffix_start + 2 :]
180
180
 
181
- # Parse the prefix components
182
- parts = prefix.split("_")
181
+ # Parse the suffix components
182
+ parts = suffix.split("_")
183
183
  if len(parts) != 3 or parts[0] != "cluster":
184
184
  raise ValueError(
185
185
  f"Invalid clustering feature name format: {feature_name}. "
186
- f"Expected format: cluster_{{algorithm}}_{{k_value}}__{{mloda_source_features}}"
186
+ f"Expected format: {{mloda_source_features}}__cluster_{{algorithm}}_{{k_value}}"
187
187
  )
188
188
 
189
189
  algorithm, k_value = parts[1], parts[2]
@@ -37,14 +37,14 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
37
37
 
38
38
  ### 1. String-Based Creation
39
39
 
40
- Features follow the naming pattern: `{imputation_method}_imputed__{mloda_source_features}`
40
+ Features follow the naming pattern: `{mloda_source_features}__{imputation_method}_imputed`
41
41
 
42
42
  Examples:
43
43
  ```python
44
44
  features = [
45
- "mean_imputed__income", # Impute missing values in income with the mean
46
- "median_imputed__age", # Impute missing values in age with the median
47
- "constant_imputed__category" # Impute missing values in category with a constant value
45
+ "income__mean_imputed", # Impute missing values in income with the mean
46
+ "age__median_imputed", # Impute missing values in age with the median
47
+ "category__constant_imputed" # Impute missing values in category with a constant value
48
48
  ]
49
49
  ```
50
50
 
@@ -85,16 +85,16 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
85
85
  from mloda_core.abstract_plugins.components.feature import Feature
86
86
 
87
87
  # Impute missing income values with mean
88
- feature = Feature(name="mean_imputed__income")
88
+ feature = Feature(name="income__mean_imputed")
89
89
 
90
90
  # Impute missing age values with median
91
- feature = Feature(name="median_imputed__age")
91
+ feature = Feature(name="age__median_imputed")
92
92
 
93
93
  # Impute missing category values with mode
94
- feature = Feature(name="mode_imputed__category")
94
+ feature = Feature(name="category__mode_imputed")
95
95
 
96
96
  # Forward fill missing temperature values
97
- feature = Feature(name="ffill_imputed__temperature")
97
+ feature = Feature(name="temperature__ffill_imputed")
98
98
  ```
99
99
 
100
100
  ### Configuration-Based Creation
@@ -158,7 +158,7 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
158
158
  }
159
159
 
160
160
  PATTERN = "__"
161
- PREFIX_PATTERN = r"^([\w]+)_imputed__"
161
+ PREFIX_PATTERN = r".*__([\w]+)_imputed$"
162
162
 
163
163
  PROPERTY_MAPPING = {
164
164
  IMPUTATION_METHOD: {
@@ -187,7 +187,10 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
187
187
  source_feature: str | None = None
188
188
 
189
189
  # Try string-based parsing first
190
- _, source_feature = FeatureChainParser.parse_feature_name(feature_name, self.PATTERN, [self.PREFIX_PATTERN])
190
+ # parse_feature_name returns (operation_config, source_feature)
191
+ operation_config, source_feature = FeatureChainParser.parse_feature_name(
192
+ feature_name, self.PATTERN, [self.PREFIX_PATTERN]
193
+ )
191
194
  if source_feature is not None:
192
195
  return {Feature(source_feature)}
193
196
 
@@ -202,11 +205,16 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
202
205
  @classmethod
203
206
  def get_imputation_method(cls, feature_name: str) -> str:
204
207
  """Extract the imputation method from the feature name."""
205
- imputation_method, _ = FeatureChainParser.parse_feature_name(feature_name, cls.PATTERN, [cls.PREFIX_PATTERN])
206
- if imputation_method is None:
208
+ # parse_feature_name returns (operation_config, source_feature)
209
+ # The operation_config contains the imputation method extracted from the suffix pattern
210
+ operation_config, _ = FeatureChainParser.parse_feature_name(feature_name, cls.PATTERN, [cls.PREFIX_PATTERN])
211
+ if operation_config is None:
207
212
  raise ValueError(f"Invalid missing value feature name format: {feature_name}")
208
213
 
209
- imputation_method = imputation_method.replace("imputed", "").strip("_")
214
+ # The PREFIX_PATTERN captures the method name (e.g., "mean" from "mean_imputed")
215
+ # So operation_config already contains just the method name
216
+ imputation_method = operation_config
217
+
210
218
  # Validate imputation method
211
219
  if imputation_method not in cls.IMPUTATION_METHODS:
212
220
  raise ValueError(
@@ -257,7 +265,9 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
257
265
  feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
258
266
 
259
267
  if cls.PATTERN in feature_name_str:
268
+ # Use get_imputation_method which already handles parse_feature_name correctly
260
269
  imputation_method = cls.get_imputation_method(feature_name_str)
270
+ # Use extract_source_feature which returns everything before the last __
261
271
  source_feature_name = FeatureChainParser.extract_source_feature(feature_name_str, cls.PREFIX_PATTERN)
262
272
  return imputation_method, source_feature_name
263
273
 
@@ -271,7 +281,7 @@ class MissingValueFeatureGroup(AbstractFeatureGroup):
271
281
  if imputation_method is None or source_feature_name is None:
272
282
  raise ValueError(f"Could not extract imputation method and source feature from: {feature.name}")
273
283
 
274
- imputation_method = imputation_method.replace("imputed", "").strip("_")
284
+ # Validate imputation method (no need to strip "imputed" from config-based method)
275
285
  if imputation_method not in cls.IMPUTATION_METHODS:
276
286
  raise ValueError(
277
287
  f"Unsupported imputation method: {imputation_method}. "