mloda 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/METADATA +10 -10
  2. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/RECORD +92 -91
  3. mloda_core/abstract_plugins/components/base_artifact.py +3 -1
  4. mloda_core/abstract_plugins/components/feature.py +4 -4
  5. mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +44 -17
  6. mloda_core/abstract_plugins/components/feature_collection.py +2 -2
  7. mloda_core/abstract_plugins/components/feature_group_version.py +4 -4
  8. mloda_core/abstract_plugins/components/feature_name.py +0 -3
  9. mloda_core/abstract_plugins/components/input_data/base_input_data.py +3 -3
  10. mloda_core/abstract_plugins/components/link.py +113 -29
  11. mloda_core/abstract_plugins/components/options.py +10 -10
  12. mloda_core/api/prepare/setup_compute_framework.py +2 -2
  13. mloda_core/api/request.py +44 -13
  14. mloda_core/core/step/feature_group_step.py +2 -1
  15. mloda_core/filter/filter_engine.py +3 -12
  16. mloda_core/filter/filter_parameter.py +55 -0
  17. mloda_core/filter/single_filter.py +4 -4
  18. mloda_core/prepare/execution_plan.py +12 -6
  19. mloda_core/prepare/graph/graph.py +3 -3
  20. mloda_core/prepare/identify_feature_group.py +10 -3
  21. mloda_core/prepare/resolve_links.py +86 -18
  22. mloda_core/runtime/flight/flight_server.py +1 -1
  23. mloda_core/runtime/run.py +7 -5
  24. mloda_core/runtime/worker/multiprocessing_worker.py +11 -9
  25. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +7 -33
  26. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
  27. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +22 -12
  28. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +2 -2
  29. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +2 -2
  30. mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +2 -2
  31. mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
  32. mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +3 -3
  33. mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +5 -5
  34. mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +8 -34
  35. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_merge_engine.py +1 -1
  36. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +3 -3
  37. mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +1 -1
  38. mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +2 -2
  39. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +7 -33
  40. mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +1 -1
  41. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +13 -32
  42. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +1 -1
  43. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
  44. mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +13 -32
  45. mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +4 -4
  46. mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
  47. mloda_plugins/config/feature/loader.py +12 -18
  48. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +20 -17
  49. mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +8 -8
  50. mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +8 -8
  51. mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +7 -7
  52. mloda_plugins/feature_group/experimental/clustering/base.py +26 -26
  53. mloda_plugins/feature_group/experimental/clustering/pandas.py +31 -29
  54. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +23 -22
  55. mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +16 -16
  56. mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +9 -11
  57. mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +8 -8
  58. mloda_plugins/feature_group/experimental/default_options_key.py +1 -1
  59. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +17 -15
  60. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +30 -18
  61. mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +35 -35
  62. mloda_plugins/feature_group/experimental/forecasting/base.py +39 -29
  63. mloda_plugins/feature_group/experimental/forecasting/pandas.py +18 -18
  64. mloda_plugins/feature_group/experimental/geo_distance/base.py +18 -20
  65. mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
  66. mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +6 -6
  67. mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +2 -2
  68. mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +2 -2
  69. mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +2 -2
  70. mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +3 -2
  71. mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +1 -1
  72. mloda_plugins/feature_group/experimental/node_centrality/base.py +8 -12
  73. mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
  74. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +11 -12
  75. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
  76. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +9 -14
  77. mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
  78. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +8 -9
  79. mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
  80. mloda_plugins/feature_group/experimental/source_input_feature.py +10 -10
  81. mloda_plugins/feature_group/experimental/text_cleaning/base.py +8 -11
  82. mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
  83. mloda_plugins/feature_group/experimental/time_window/base.py +27 -25
  84. mloda_plugins/feature_group/experimental/time_window/pandas.py +8 -8
  85. mloda_plugins/feature_group/experimental/time_window/pyarrow.py +6 -6
  86. mloda_plugins/feature_group/input_data/read_context_files.py +1 -1
  87. mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +1 -1
  88. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/WHEEL +0 -0
  89. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/entry_points.txt +0 -0
  90. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/LICENSE.TXT +0 -0
  91. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/NOTICE.md +0 -0
  92. {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/top_level.txt +0 -0
@@ -4,15 +4,17 @@ Pandas implementation for dimensionality reduction feature groups.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- from typing import Any, List, cast
7
+ from typing import Any, List, TYPE_CHECKING, cast
8
8
 
9
+ if TYPE_CHECKING:
10
+ from numpy.typing import NDArray
9
11
 
10
12
  try:
11
13
  import pandas as pd
12
14
  import numpy as np
13
15
  except ImportError:
14
16
  pd = None
15
- np = None # type: ignore
17
+ np = None # type: ignore[assignment]
16
18
 
17
19
  # Check if required packages are available
18
20
  SKLEARN_AVAILABLE = True
@@ -26,7 +28,7 @@ except ImportError:
26
28
 
27
29
 
28
30
  from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
29
- from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataframe
31
+ from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
30
32
  from mloda_plugins.feature_group.experimental.dimensionality_reduction.base import DimensionalityReductionFeatureGroup
31
33
 
32
34
 
@@ -34,7 +36,7 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
34
36
  @classmethod
35
37
  def compute_framework_rule(cls) -> set[type[ComputeFrameWork]]:
36
38
  """Define the compute framework for this feature group."""
37
- return {PandasDataframe}
39
+ return {PandasDataFrame}
38
40
 
39
41
  @classmethod
40
42
  def _check_source_feature_exists(cls, data: pd.DataFrame, feature_name: str) -> None:
@@ -52,7 +54,7 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
52
54
  raise ValueError(f"Feature '{feature_name}' not found in the data")
53
55
 
54
56
  @classmethod
55
- def _add_result_to_data(cls, data: pd.DataFrame, feature_name: str, result: np.ndarray) -> pd.DataFrame: # type: ignore
57
+ def _add_result_to_data(cls, data: "pd.DataFrame", feature_name: str, result: "NDArray[Any]") -> "pd.DataFrame":
56
58
  """
57
59
  Add the dimensionality reduction result to the DataFrame using the multiple result columns pattern.
58
60
 
@@ -83,7 +85,7 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
83
85
  dimension: int,
84
86
  source_features: List[str],
85
87
  options: Any,
86
- ) -> np.ndarray: # type: ignore
88
+ ) -> "NDArray[Any]":
87
89
  """
88
90
  Perform dimensionality reduction on the specified features.
89
91
 
@@ -184,7 +186,7 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
184
186
  raise ValueError(f"Unsupported dimensionality reduction algorithm: {algorithm}")
185
187
 
186
188
  @classmethod
187
- def _perform_pca_reduction(cls, X: np.ndarray, dimension: int, svd_solver: str = "auto") -> np.ndarray: # type: ignore
189
+ def _perform_pca_reduction(cls, X: "NDArray[Any]", dimension: int, svd_solver: str = "auto") -> "NDArray[Any]":
188
190
  """
189
191
  Perform Principal Component Analysis (PCA).
190
192
 
@@ -202,17 +204,17 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
202
204
 
203
205
  # Perform PCA
204
206
  pca = PCA(n_components=dimension, random_state=42, svd_solver=svd_solver)
205
- return pca.fit_transform(X) # type: ignore
207
+ return cast("NDArray[Any]", pca.fit_transform(X))
206
208
 
207
209
  @classmethod
208
210
  def _perform_tsne_reduction(
209
211
  cls,
210
- X: np.ndarray, # type: ignore
212
+ X: "NDArray[Any]",
211
213
  dimension: int,
212
214
  max_iter: int = 250,
213
215
  n_iter_without_progress: int = 50,
214
216
  method: str = "barnes_hut",
215
- ) -> np.ndarray: # type: ignore
217
+ ) -> "NDArray[Any]":
216
218
  """
217
219
  Perform t-Distributed Stochastic Neighbor Embedding (t-SNE).
218
220
 
@@ -254,10 +256,10 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
254
256
  n_iter_without_progress=n_iter_without_progress,
255
257
  method=actual_method,
256
258
  )
257
- return tsne.fit_transform(X) # type: ignore
259
+ return cast("NDArray[Any]", tsne.fit_transform(X))
258
260
 
259
261
  @classmethod
260
- def _perform_ica_reduction(cls, X: np.ndarray, dimension: int, max_iter: int = 200) -> np.ndarray: # type: ignore
262
+ def _perform_ica_reduction(cls, X: "NDArray[Any]", dimension: int, max_iter: int = 200) -> "NDArray[Any]":
261
263
  """
262
264
  Perform Independent Component Analysis (ICA).
263
265
 
@@ -273,12 +275,22 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
273
275
  if not SKLEARN_AVAILABLE:
274
276
  raise ImportError("scikit-learn is required for ICA dimensionality reduction")
275
277
 
278
+ # For small datasets, increase tolerance and iterations for better convergence
279
+ n_samples = X.shape[0]
280
+ if n_samples < 50:
281
+ # Small datasets may need more iterations and higher tolerance
282
+ actual_max_iter = max(max_iter, 1000)
283
+ tol = 0.01
284
+ else:
285
+ actual_max_iter = max_iter
286
+ tol = 1e-4 # sklearn default
287
+
276
288
  # Perform ICA
277
- ica = FastICA(n_components=dimension, random_state=42, max_iter=max_iter)
278
- return ica.fit_transform(X) # type: ignore
289
+ ica = FastICA(n_components=dimension, random_state=42, max_iter=actual_max_iter, tol=tol)
290
+ return cast("NDArray[Any]", ica.fit_transform(X))
279
291
 
280
292
  @classmethod
281
- def _perform_lda_reduction(cls, X: np.ndarray, dimension: int, df: pd.DataFrame) -> np.ndarray: # type: ignore
293
+ def _perform_lda_reduction(cls, X: "NDArray[Any]", dimension: int, df: "pd.DataFrame") -> "NDArray[Any]":
282
294
  """
283
295
  Perform Linear Discriminant Analysis (LDA).
284
296
 
@@ -307,10 +319,10 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
307
319
 
308
320
  # Perform LDA
309
321
  lda = LinearDiscriminantAnalysis(n_components=dimension)
310
- return lda.fit_transform(X, y) # type: ignore
322
+ return cast("NDArray[Any]", lda.fit_transform(X, y))
311
323
 
312
324
  @classmethod
313
- def _perform_isomap_reduction(cls, X: np.ndarray, dimension: int, n_neighbors: int = 5) -> np.ndarray: # type: ignore
325
+ def _perform_isomap_reduction(cls, X: "NDArray[Any]", dimension: int, n_neighbors: int = 5) -> "NDArray[Any]":
314
326
  """
315
327
  Perform Isometric Mapping (Isomap).
316
328
 
@@ -328,4 +340,4 @@ class PandasDimensionalityReductionFeatureGroup(DimensionalityReductionFeatureGr
328
340
 
329
341
  # Perform Isomap
330
342
  isomap = Isomap(n_components=dimension, n_neighbors=n_neighbors)
331
- return isomap.fit_transform(X) # type: ignore
343
+ return cast("NDArray[Any]", isomap.fit_transform(X))
@@ -141,7 +141,7 @@ class DynamicFeatureGroupCreator:
141
141
  properties = {
142
142
  "match_feature_group_criteria": custom_match_criteria,
143
143
  "input_features": custom_input_features,
144
- "compute_framework_rule": lambda: {PandasDataframe},
144
+ "compute_framework_rule": lambda: {PandasDataFrame},
145
145
  }
146
146
 
147
147
  CustomFG = DynamicFeatureGroupCreator.create(
@@ -202,70 +202,70 @@ class DynamicFeatureGroupCreator:
202
202
  if class_name in DynamicFeatureGroupCreator._created_classes:
203
203
  return DynamicFeatureGroupCreator._created_classes[class_name]
204
204
 
205
- def set_feature_name(self, config: Options, feature_name: FeatureName) -> FeatureName: # type: ignore
205
+ def set_feature_name(self, config: Options, feature_name: FeatureName) -> FeatureName: # type: ignore[no-untyped-def]
206
206
  if "set_feature_name" in properties:
207
- return properties["set_feature_name"](self, config, feature_name) # type: ignore
207
+ return properties["set_feature_name"](self, config, feature_name) # type: ignore[no-any-return]
208
208
  return feature_name
209
209
 
210
- def match_feature_group_criteria( # type: ignore
210
+ def match_feature_group_criteria( # type: ignore[no-untyped-def]
211
211
  cls,
212
212
  feature_name: Union[FeatureName, str],
213
213
  options: Options,
214
214
  data_access_collection: Optional[DataAccessCollection] = None,
215
215
  ) -> bool:
216
216
  if "match_feature_group_criteria" in properties:
217
- return properties["match_feature_group_criteria"](cls, feature_name, options, data_access_collection) # type: ignore
218
- return super(new_class, cls).match_feature_group_criteria(feature_name, options, data_access_collection) # type: ignore
217
+ return properties["match_feature_group_criteria"](cls, feature_name, options, data_access_collection) # type: ignore[no-any-return]
218
+ return super(new_class, cls).match_feature_group_criteria(feature_name, options, data_access_collection) # type: ignore[misc, arg-type, no-any-return]
219
219
 
220
- def input_data(cls) -> Optional[BaseInputData]: # type: ignore
220
+ def input_data(cls) -> Optional[BaseInputData]: # type: ignore[no-untyped-def]
221
221
  if "input_data" in properties:
222
- return properties["input_data"]() # type: ignore
223
- return super(new_class, cls).input_data() # type: ignore
222
+ return properties["input_data"]() # type: ignore[no-any-return]
223
+ return super(new_class, cls).input_data() # type: ignore[misc, arg-type, no-any-return]
224
224
 
225
- def validate_input_features(cls, data: Any, features: FeatureSet) -> Optional[bool]: # type: ignore
225
+ def validate_input_features(cls, data: Any, features: FeatureSet) -> Optional[bool]: # type: ignore[no-untyped-def]
226
226
  if "validate_input_features" in properties:
227
- return properties["validate_input_features"](cls, data, features) # type: ignore
228
- return super(new_class, cls).validate_input_features(data, features) # type: ignore
227
+ return properties["validate_input_features"](cls, data, features) # type: ignore[no-any-return]
228
+ return super(new_class, cls).validate_input_features(data, features) # type: ignore[misc, arg-type, no-any-return]
229
229
 
230
- def calculate_feature(cls, data: Any, features: FeatureSet) -> Any: # type: ignore
230
+ def calculate_feature(cls, data: Any, features: FeatureSet) -> Any: # type: ignore[no-untyped-def]
231
231
  if "calculate_feature" in properties:
232
232
  return properties["calculate_feature"](cls, data, features)
233
- return super(new_class, cls).calculate_feature(data, features) # type: ignore
233
+ return super(new_class, cls).calculate_feature(data, features) # type: ignore[misc, arg-type]
234
234
 
235
- def validate_output_features(cls, data: Any, features: FeatureSet) -> Optional[bool]: # type: ignore
235
+ def validate_output_features(cls, data: Any, features: FeatureSet) -> Optional[bool]: # type: ignore[no-untyped-def]
236
236
  if "validate_output_features" in properties:
237
- return properties["validate_output_features"](cls, data, features) # type: ignore
238
- return super(new_class, cls).validate_output_features(data, features) # type: ignore
237
+ return properties["validate_output_features"](cls, data, features) # type: ignore[no-any-return]
238
+ return super(new_class, cls).validate_output_features(data, features) # type: ignore[misc, arg-type, no-any-return]
239
239
 
240
- def artifact(cls) -> Optional[Type[Any]]: # type: ignore
240
+ def artifact(cls) -> Optional[Type[Any]]: # type: ignore[no-untyped-def]
241
241
  if "artifact" in properties:
242
- return properties["artifact"]() # type: ignore
243
- return super(new_class, cls).artifact() # type: ignore
242
+ return properties["artifact"]() # type: ignore[no-any-return]
243
+ return super(new_class, cls).artifact() # type: ignore[misc, arg-type, no-any-return]
244
244
 
245
- def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFrameWork]]]: # type: ignore
245
+ def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFrameWork]]]: # type: ignore[no-untyped-def]
246
246
  if "compute_framework_rule" in properties:
247
- return properties["compute_framework_rule"]() # type: ignore
248
- return super(new_class, cls).compute_framework_rule() # type: ignore
247
+ return properties["compute_framework_rule"]() # type: ignore[no-any-return]
248
+ return super(new_class, cls).compute_framework_rule() # type: ignore[misc, arg-type, no-any-return]
249
249
 
250
- def return_data_type_rule(cls, feature: Any) -> Optional[DataType]: # type: ignore
250
+ def return_data_type_rule(cls, feature: Any) -> Optional[DataType]: # type: ignore[no-untyped-def]
251
251
  if "return_data_type_rule" in properties:
252
- return properties["return_data_type_rule"](cls, feature) # type: ignore
253
- return super(new_class, cls).return_data_type_rule(feature) # type: ignore
252
+ return properties["return_data_type_rule"](cls, feature) # type: ignore[no-any-return]
253
+ return super(new_class, cls).return_data_type_rule(feature) # type: ignore[misc, arg-type, no-any-return]
254
254
 
255
- def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Any]]: # type: ignore
255
+ def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Any]]: # type: ignore[no-untyped-def]
256
256
  if "input_features" in properties:
257
- return properties["input_features"](self, options, feature_name) # type: ignore
258
- return super(new_class, self).input_features(options, feature_name) # type: ignore
257
+ return properties["input_features"](self, options, feature_name) # type: ignore[no-any-return]
258
+ return super(new_class, self).input_features(options, feature_name) # type: ignore[misc, arg-type, no-any-return]
259
259
 
260
- def index_columns(cls) -> Optional[List[Index]]: # type: ignore
260
+ def index_columns(cls) -> Optional[List[Index]]: # type: ignore[no-untyped-def]
261
261
  if "index_columns" in properties:
262
- return properties["index_columns"]() # type: ignore
263
- return super(new_class, cls).index_columns() # type: ignore
262
+ return properties["index_columns"]() # type: ignore[no-any-return]
263
+ return super(new_class, cls).index_columns() # type: ignore[misc, arg-type, no-any-return]
264
264
 
265
- def supports_index(cls, index: Index) -> Optional[bool]: # type: ignore
265
+ def supports_index(cls, index: Index) -> Optional[bool]: # type: ignore[no-untyped-def]
266
266
  if "supports_index" in properties:
267
- return properties["supports_index"](cls, index) # type: ignore
268
- return super(new_class, cls).supports_index(index) # type: ignore
267
+ return properties["supports_index"](cls, index) # type: ignore[no-any-return]
268
+ return super(new_class, cls).supports_index(index) # type: ignore[misc, arg-type, no-any-return]
269
269
 
270
270
  new_class = type(
271
271
  class_name,
@@ -4,12 +4,16 @@ Base implementation for forecasting feature groups.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
+ from abc import abstractmethod
7
8
  from typing import Any, List, Optional, Set, Type, Union
8
9
 
9
10
  from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
10
11
  from mloda_core.abstract_plugins.components.base_artifact import BaseArtifact
11
12
  from mloda_core.abstract_plugins.components.feature import Feature
12
- from mloda_core.abstract_plugins.components.feature_chainer.feature_chain_parser import FeatureChainParser
13
+ from mloda_core.abstract_plugins.components.feature_chainer.feature_chain_parser import (
14
+ CHAIN_SEPARATOR,
15
+ FeatureChainParser,
16
+ )
13
17
  from mloda_core.abstract_plugins.components.feature_name import FeatureName
14
18
  from mloda_core.abstract_plugins.components.feature_set import FeatureSet
15
19
  from mloda_core.abstract_plugins.components.options import Options
@@ -30,7 +34,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
30
34
 
31
35
  ### 1. String-Based Creation
32
36
 
33
- Features follow the naming pattern: `{mloda_source_features}__{algorithm}_forecast_{horizon}{time_unit}`
37
+ Features follow the naming pattern: `{in_features}__{algorithm}_forecast_{horizon}{time_unit}`
34
38
 
35
39
  Examples:
36
40
  ```python
@@ -53,7 +57,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
53
57
  ForecastingFeatureGroup.ALGORITHM: "linear",
54
58
  ForecastingFeatureGroup.HORIZON: 7,
55
59
  ForecastingFeatureGroup.TIME_UNIT: "day",
56
- DefaultOptionKeys.mloda_source_features: "sales",
60
+ DefaultOptionKeys.in_features: "sales",
57
61
  }
58
62
  )
59
63
  )
@@ -66,7 +70,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
66
70
  - `algorithm`: The forecasting algorithm to use
67
71
  - `horizon`: The forecast horizon (number of time units)
68
72
  - `time_unit`: The time unit for the horizon
69
- - `mloda_source_features`: The source feature to generate forecasts for
73
+ - `in_features`: The source feature to generate forecasts for
70
74
 
71
75
  ### Group Parameters
72
76
  Currently none for ForecastingFeatureGroup. Parameters that affect Feature Group
@@ -128,7 +132,6 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
128
132
 
129
133
  # Define the prefix pattern for this feature group
130
134
  PREFIX_PATTERN = r".*__([\w]+)_forecast_(\d+)([\w]+)$"
131
- PATTERN = "__"
132
135
 
133
136
  # Property mapping for configuration-based features with group/context separation
134
137
  PROPERTY_MAPPING = {
@@ -151,7 +154,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
151
154
  DefaultOptionKeys.mloda_context: True,
152
155
  DefaultOptionKeys.mloda_strict_validation: True,
153
156
  },
154
- DefaultOptionKeys.mloda_source_features: {
157
+ DefaultOptionKeys.in_features: {
155
158
  "explanation": "Source feature to generate forecasts for",
156
159
  DefaultOptionKeys.mloda_context: True,
157
160
  DefaultOptionKeys.mloda_strict_validation: False,
@@ -202,13 +205,13 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
202
205
  source_feature: str | None = None
203
206
 
204
207
  # Try string-based parsing first
205
- _, source_feature = FeatureChainParser.parse_feature_name(feature_name, self.PATTERN, [self.PREFIX_PATTERN])
208
+ _, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
206
209
  if source_feature is not None:
207
210
  time_filter_feature = Feature(self.get_time_filter_feature(options))
208
211
  return {Feature(source_feature), time_filter_feature}
209
212
 
210
213
  # Fall back to configuration-based approach
211
- source_features = options.get_source_features()
214
+ source_features = options.get_in_features()
212
215
  if len(source_features) != 1:
213
216
  raise ValueError(
214
217
  f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
@@ -246,7 +249,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
246
249
  if len(parts) < 3 or parts[1] != "forecast":
247
250
  raise ValueError(
248
251
  f"Invalid forecast feature name format: {feature_name}. "
249
- f"Expected format: {{mloda_source_features}}__{{algorithm}}_forecast_{{horizon}}{{time_unit}}"
252
+ f"Expected format: {{in_features}}__{{algorithm}}_forecast_{{horizon}}{{time_unit}}"
250
253
  )
251
254
 
252
255
  algorithm = parts[0]
@@ -297,7 +300,6 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
297
300
  feature_name,
298
301
  options,
299
302
  property_mapping=cls.PROPERTY_MAPPING,
300
- pattern=cls.PATTERN,
301
303
  prefix_patterns=[cls.PREFIX_PATTERN],
302
304
  )
303
305
 
@@ -306,7 +308,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
306
308
  feature_name_str = feature_name.name if isinstance(feature_name, FeatureName) else feature_name
307
309
 
308
310
  # Check if this is a string-based feature (contains the pattern)
309
- if cls.PATTERN in feature_name_str:
311
+ if FeatureChainParser.is_chained_feature(feature_name_str):
310
312
  try:
311
313
  # Use existing validation logic that validates algorithm, horizon, and time_unit
312
314
  cls.parse_forecast_suffix(feature_name_str)
@@ -350,13 +352,13 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
350
352
 
351
353
  # Process each requested feature with the original clean data
352
354
  for feature in features.features:
353
- algorithm, horizon, time_unit, mloda_source_features = cls._extract_forecasting_parameters(feature)
355
+ algorithm, horizon, time_unit, in_features = cls._extract_forecasting_parameters(feature)
354
356
 
355
357
  # Resolve multi-column features automatically
356
- # If mloda_source_features is "onehot_encoded__product", this discovers
358
+ # If in_features is "onehot_encoded__product", this discovers
357
359
  # ["onehot_encoded__product~0", "onehot_encoded__product~1", ...]
358
360
  available_columns = cls._get_available_columns(original_data)
359
- resolved_columns = cls.resolve_multi_column_feature(mloda_source_features, available_columns)
361
+ resolved_columns = cls.resolve_multi_column_feature(in_features, available_columns)
360
362
 
361
363
  # Check that resolved columns exist
362
364
  cls._check_source_features_exist(original_data, resolved_columns)
@@ -428,14 +430,15 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
428
430
  # Try string-based parsing first
429
431
  feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
430
432
 
431
- if cls.PATTERN in feature_name_str:
433
+ if FeatureChainParser.is_chained_feature(feature_name_str):
432
434
  algorithm, horizon, time_unit = cls.parse_forecast_suffix(feature_name_str)
433
- # Extract source feature (everything before the __)
434
- source_feature_name = feature_name_str.split(cls.PATTERN)[0]
435
+
436
+ # Extract source feature name (everything before the last double underscore)
437
+ source_feature_name = feature_name_str.rsplit(CHAIN_SEPARATOR, 1)[0]
435
438
  return algorithm, horizon, time_unit, source_feature_name
436
439
 
437
440
  # Fall back to configuration-based approach
438
- source_features = feature.options.get_source_features()
441
+ source_features = feature.options.get_in_features()
439
442
  source_feature = next(iter(source_features))
440
443
  source_feature_name = source_feature.get_name()
441
444
 
@@ -466,6 +469,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
466
469
  return algorithm, horizon, time_unit, source_feature_name
467
470
 
468
471
  @classmethod
472
+ @abstractmethod
469
473
  def _check_time_filter_feature_exists(cls, data: Any, time_filter_feature: str) -> None:
470
474
  """
471
475
  Check if the time filter feature exists in the data.
@@ -477,9 +481,10 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
477
481
  Raises:
478
482
  ValueError: If the time filter feature does not exist in the data
479
483
  """
480
- raise NotImplementedError(f"_check_time_filter_feature_exists not implemented in {cls.__name__}")
484
+ ...
481
485
 
482
486
  @classmethod
487
+ @abstractmethod
483
488
  def _check_time_filter_feature_is_datetime(cls, data: Any, time_filter_feature: str) -> None:
484
489
  """
485
490
  Check if the time filter feature is a datetime column.
@@ -491,9 +496,10 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
491
496
  Raises:
492
497
  ValueError: If the time filter feature is not a datetime column
493
498
  """
494
- raise NotImplementedError(f"_check_time_filter_feature_is_datetime not implemented in {cls.__name__}")
499
+ ...
495
500
 
496
501
  @classmethod
502
+ @abstractmethod
497
503
  def _get_available_columns(cls, data: Any) -> Set[str]:
498
504
  """
499
505
  Get the set of available column names from the data.
@@ -504,9 +510,10 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
504
510
  Returns:
505
511
  Set of column names available in the data
506
512
  """
507
- raise NotImplementedError(f"_get_available_columns not implemented in {cls.__name__}")
513
+ ...
508
514
 
509
515
  @classmethod
516
+ @abstractmethod
510
517
  def _check_source_features_exist(cls, data: Any, feature_names: List[str]) -> None:
511
518
  """
512
519
  Check if the resolved source features exist in the data.
@@ -518,9 +525,10 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
518
525
  Raises:
519
526
  ValueError: If none of the features exist in the data
520
527
  """
521
- raise NotImplementedError(f"_check_source_features_exist not implemented in {cls.__name__}")
528
+ ...
522
529
 
523
530
  @classmethod
531
+ @abstractmethod
524
532
  def _add_result_to_data(cls, data: Any, feature_name: str, result: Any) -> Any:
525
533
  """
526
534
  Add the result to the data.
@@ -533,16 +541,17 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
533
541
  Returns:
534
542
  The updated data
535
543
  """
536
- raise NotImplementedError(f"_add_result_to_data not implemented in {cls.__name__}")
544
+ ...
537
545
 
538
546
  @classmethod
547
+ @abstractmethod
539
548
  def _perform_forecasting(
540
549
  cls,
541
550
  data: Any,
542
551
  algorithm: str,
543
552
  horizon: int,
544
553
  time_unit: str,
545
- mloda_source_features: List[str],
554
+ in_features: List[str],
546
555
  time_filter_feature: str,
547
556
  model_artifact: Optional[Any] = None,
548
557
  ) -> tuple[Any, Optional[Any]]:
@@ -558,23 +567,24 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
558
567
  algorithm: The forecasting algorithm to use
559
568
  horizon: The forecast horizon
560
569
  time_unit: The time unit for the horizon
561
- mloda_source_features: List of resolved source feature names to forecast
570
+ in_features: List of resolved source feature names to forecast
562
571
  time_filter_feature: The name of the time filter feature
563
572
  model_artifact: Optional artifact containing a trained model
564
573
 
565
574
  Returns:
566
575
  A tuple containing (forecast_result, updated_artifact)
567
576
  """
568
- raise NotImplementedError(f"_perform_forecasting not implemented in {cls.__name__}")
577
+ ...
569
578
 
570
579
  @classmethod
580
+ @abstractmethod
571
581
  def _perform_forecasting_with_confidence(
572
582
  cls,
573
583
  data: Any,
574
584
  algorithm: str,
575
585
  horizon: int,
576
586
  time_unit: str,
577
- mloda_source_features: List[str],
587
+ in_features: List[str],
578
588
  time_filter_feature: str,
579
589
  model_artifact: Optional[Any] = None,
580
590
  ) -> tuple[Any, Any, Any, Optional[Any]]:
@@ -588,7 +598,7 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
588
598
  algorithm: The forecasting algorithm to use
589
599
  horizon: The forecast horizon
590
600
  time_unit: The time unit for the horizon
591
- mloda_source_features: List of resolved source feature names to forecast
601
+ in_features: List of resolved source feature names to forecast
592
602
  time_filter_feature: The name of the time filter feature
593
603
  model_artifact: Optional artifact containing a trained model
594
604
 
@@ -599,4 +609,4 @@ class ForecastingFeatureGroup(AbstractFeatureGroup):
599
609
  - upper_bound: The upper confidence bound
600
610
  - updated_artifact: The updated artifact (or None)
601
611
  """
602
- raise NotImplementedError(f"_perform_forecasting_with_confidence not implemented in {cls.__name__}")
612
+ ...
@@ -28,7 +28,7 @@ except ImportError:
28
28
 
29
29
 
30
30
  from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
31
- from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataframe
31
+ from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
32
32
  from mloda_plugins.feature_group.experimental.forecasting.base import ForecastingFeatureGroup
33
33
 
34
34
 
@@ -36,7 +36,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
36
36
  @classmethod
37
37
  def compute_framework_rule(cls) -> set[type[ComputeFrameWork]]:
38
38
  """Define the compute framework for this feature group."""
39
- return {PandasDataframe}
39
+ return {PandasDataFrame}
40
40
 
41
41
  @classmethod
42
42
  def _get_available_columns(cls, data: pd.DataFrame) -> Set[str]:
@@ -120,7 +120,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
120
120
  algorithm: str,
121
121
  horizon: int,
122
122
  time_unit: str,
123
- mloda_source_features: List[str],
123
+ in_features: List[str],
124
124
  time_filter_feature: str,
125
125
  model_artifact: Optional[Any] = None,
126
126
  ) -> Tuple[pd.Series, Dict[str, Any]]:
@@ -142,7 +142,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
142
142
  algorithm: The forecasting algorithm to use
143
143
  horizon: The forecast horizon
144
144
  time_unit: The time unit for the horizon
145
- mloda_source_features: List of resolved source feature names to forecast
145
+ in_features: List of resolved source feature names to forecast
146
146
  time_filter_feature: The name of the time filter feature
147
147
  model_artifact: Optional artifact containing a trained model
148
148
 
@@ -173,7 +173,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
173
173
  # For multi-column features, we need to handle each column separately or aggregate them
174
174
  # For now, we'll use the first column for single-column behavior
175
175
  # In the future, this could be extended to forecast multiple columns or aggregated columns
176
- source_feature_name = mloda_source_features[0] if len(mloda_source_features) == 1 else mloda_source_features[0]
176
+ source_feature_name = in_features[0] if len(in_features) == 1 else in_features[0]
177
177
 
178
178
  # Create or load the model
179
179
  if model_artifact is None:
@@ -314,14 +314,14 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
314
314
 
315
315
  @classmethod
316
316
  def _create_features(
317
- cls, df: pd.DataFrame, mloda_source_features: str, time_filter_feature: str, lag_features: List[int]
317
+ cls, df: pd.DataFrame, in_features: str, time_filter_feature: str, lag_features: List[int]
318
318
  ) -> Tuple[pd.DataFrame, pd.Series]:
319
319
  """
320
320
  Create features for training the forecasting model.
321
321
 
322
322
  Args:
323
323
  df: The pandas DataFrame
324
- mloda_source_features: The name of the source feature
324
+ in_features: The name of the source feature
325
325
  time_filter_feature: The name of the time filter feature
326
326
  lag_features: List of lag periods to use
327
327
 
@@ -332,13 +332,13 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
332
332
  df_features = df.copy()
333
333
 
334
334
  # Extract target variable
335
- y = df_features[mloda_source_features]
335
+ y = df_features[in_features]
336
336
 
337
337
  # Create time-based features
338
338
  df_features = cls._create_time_features(df_features, time_filter_feature)
339
339
 
340
340
  # Create lag features (previous values)
341
- df_features = cls._create_lag_features(df_features, mloda_source_features, lags=lag_features)
341
+ df_features = cls._create_lag_features(df_features, in_features, lags=lag_features)
342
342
 
343
343
  # Drop rows with NaN values (from lag features)
344
344
  df_features = df_features.dropna()
@@ -353,7 +353,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
353
353
  )
354
354
 
355
355
  # Drop the original source feature and time filter feature
356
- X = df_features.drop([mloda_source_features, time_filter_feature], axis=1)
356
+ X = df_features.drop([in_features, time_filter_feature], axis=1)
357
357
 
358
358
  return X, y
359
359
 
@@ -420,7 +420,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
420
420
  cls,
421
421
  df: pd.DataFrame,
422
422
  future_timestamps: List[datetime],
423
- mloda_source_features: str,
423
+ in_features: str,
424
424
  time_filter_feature: str,
425
425
  lag_features: List[int],
426
426
  ) -> pd.DataFrame:
@@ -430,7 +430,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
430
430
  Args:
431
431
  df: The pandas DataFrame with historical data
432
432
  future_timestamps: List of future timestamps to create features for
433
- mloda_source_features: The name of the source feature
433
+ in_features: The name of the source feature
434
434
  time_filter_feature: The name of the time filter feature
435
435
  lag_features: List of lag periods to use
436
436
 
@@ -446,7 +446,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
446
446
  # Get the most recent values for lag features
447
447
  max_lag = max(lag_features)
448
448
  available_values = min(len(df), max_lag)
449
- last_values = df[mloda_source_features].iloc[-available_values:].tolist()
449
+ last_values = df[in_features].iloc[-available_values:].tolist()
450
450
  last_values.reverse() # Reverse to get [t-n, ..., t-2, t-1]
451
451
 
452
452
  # Pad with the last value if we don't have enough history
@@ -457,9 +457,9 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
457
457
  for lag in lag_features:
458
458
  lag_index = lag - 1 # Convert lag to index (lag 1 = index 0)
459
459
  if lag_index < len(last_values):
460
- future_df[f"{mloda_source_features}_lag_{lag}"] = last_values[lag_index]
460
+ future_df[f"{in_features}_lag_{lag}"] = last_values[lag_index]
461
461
  else:
462
- future_df[f"{mloda_source_features}_lag_{lag}"] = last_values[-1]
462
+ future_df[f"{in_features}_lag_{lag}"] = last_values[-1]
463
463
 
464
464
  # Drop the time filter feature
465
465
  future_df = future_df.drop([time_filter_feature], axis=1)
@@ -513,7 +513,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
513
513
  algorithm: str,
514
514
  horizon: int,
515
515
  time_unit: str,
516
- mloda_source_features: List[str],
516
+ in_features: List[str],
517
517
  time_filter_feature: str,
518
518
  model_artifact: Optional[Any] = None,
519
519
  ) -> Tuple[pd.Series, pd.Series, pd.Series, Dict[str, Any]]:
@@ -531,7 +531,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
531
531
  algorithm: The forecasting algorithm to use
532
532
  horizon: The forecast horizon
533
533
  time_unit: The time unit for the horizon
534
- mloda_source_features: List of resolved source feature names to forecast
534
+ in_features: List of resolved source feature names to forecast
535
535
  time_filter_feature: The name of the time filter feature
536
536
  model_artifact: Optional artifact containing a trained model
537
537
 
@@ -560,7 +560,7 @@ class PandasForecastingFeatureGroup(ForecastingFeatureGroup):
560
560
  lag_features = cls._determine_lag_features(horizon, time_unit, len(df))
561
561
 
562
562
  # For multi-column features, use the first column
563
- source_feature_name = mloda_source_features[0] if len(mloda_source_features) == 1 else mloda_source_features[0]
563
+ source_feature_name = in_features[0] if len(in_features) == 1 else in_features[0]
564
564
 
565
565
  # Create or load the model
566
566
  if model_artifact is None: