mloda 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +45 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -47
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +15 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/METADATA +24 -31
- mloda-0.4.1.dist-info/RECORD +248 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -11
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +2 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -64
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +67 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -82
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +81 -96
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +108 -106
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +52 -44
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -3
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -74
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +53 -53
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +3 -4
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -60
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -3
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -63
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +108 -95
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.3.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/WHEEL +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from typing import Set, Type
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from mloda.core.abstract_plugins.feature_group import FeatureGroup
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
class
|
|
6
|
+
class PluginCollector:
|
|
7
7
|
"""
|
|
8
|
-
The
|
|
8
|
+
The PluginCollector class is a helper class with the purpose to disable or enable feature groups.
|
|
9
9
|
|
|
10
10
|
This class is useful for rapid prototype development, where you want to disable or enable feature groups,
|
|
11
11
|
when the other, competing feature groups are found.
|
|
@@ -15,16 +15,16 @@ class PlugInCollector:
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
def __init__(self) -> None:
|
|
18
|
-
self.disabled_feature_group_classes: Set[Type[
|
|
19
|
-
self.enabled_feature_group_classes: Set[Type[
|
|
18
|
+
self.disabled_feature_group_classes: Set[Type[FeatureGroup]] = set()
|
|
19
|
+
self.enabled_feature_group_classes: Set[Type[FeatureGroup]] = set()
|
|
20
20
|
|
|
21
|
-
def add_disabled_feature_group_classes(self, feature_group_cls: Set[Type[
|
|
21
|
+
def add_disabled_feature_group_classes(self, feature_group_cls: Set[Type[FeatureGroup]]) -> None:
|
|
22
22
|
self.disabled_feature_group_classes.update(feature_group_cls)
|
|
23
23
|
|
|
24
|
-
def add_enabled_feature_group_classes(self, feature_group_cls: Set[Type[
|
|
24
|
+
def add_enabled_feature_group_classes(self, feature_group_cls: Set[Type[FeatureGroup]]) -> None:
|
|
25
25
|
self.enabled_feature_group_classes.update(feature_group_cls)
|
|
26
26
|
|
|
27
|
-
def applicable_feature_group_class(self, feature_group_cls: Type[
|
|
27
|
+
def applicable_feature_group_class(self, feature_group_cls: Type[FeatureGroup]) -> bool:
|
|
28
28
|
if feature_group_cls in self.disabled_feature_group_classes:
|
|
29
29
|
return False
|
|
30
30
|
|
|
@@ -38,22 +38,22 @@ class PlugInCollector:
|
|
|
38
38
|
|
|
39
39
|
@staticmethod
|
|
40
40
|
def disabled_feature_groups(
|
|
41
|
-
feature_group_cls: Set[Type[
|
|
42
|
-
) -> "
|
|
41
|
+
feature_group_cls: Set[Type[FeatureGroup]] | Type[FeatureGroup],
|
|
42
|
+
) -> "PluginCollector":
|
|
43
43
|
if not isinstance(feature_group_cls, Set):
|
|
44
44
|
feature_group_cls = {feature_group_cls}
|
|
45
45
|
|
|
46
|
-
plugin_collector =
|
|
46
|
+
plugin_collector = PluginCollector()
|
|
47
47
|
plugin_collector.add_disabled_feature_group_classes(feature_group_cls)
|
|
48
48
|
return plugin_collector
|
|
49
49
|
|
|
50
50
|
@staticmethod
|
|
51
51
|
def enabled_feature_groups(
|
|
52
|
-
feature_group_cls: Set[Type[
|
|
53
|
-
) -> "
|
|
52
|
+
feature_group_cls: Set[Type[FeatureGroup]] | Type[FeatureGroup],
|
|
53
|
+
) -> "PluginCollector":
|
|
54
54
|
if not isinstance(feature_group_cls, Set):
|
|
55
55
|
feature_group_cls = {feature_group_cls}
|
|
56
56
|
|
|
57
|
-
plugin_collector =
|
|
57
|
+
plugin_collector = PluginCollector()
|
|
58
58
|
plugin_collector.add_enabled_feature_group_classes(feature_group_cls)
|
|
59
59
|
return plugin_collector
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from mloda.core.abstract_plugins.components.data_types import DataType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DataTypeMismatchError(ValueError):
|
|
8
|
+
"""Raised when feature data type doesn't match declared type."""
|
|
9
|
+
|
|
10
|
+
def __init__(self, feature_name: str, declared: DataType, actual: DataType) -> None:
|
|
11
|
+
self.feature_name = feature_name
|
|
12
|
+
self.declared = declared
|
|
13
|
+
self.actual = actual
|
|
14
|
+
super().__init__(
|
|
15
|
+
f"Feature '{feature_name}': declared {declared.name}, got {actual.name}, coercion not supported"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataTypeValidator:
|
|
20
|
+
"""Validates feature data matches declared DataType."""
|
|
21
|
+
|
|
22
|
+
_COMPATIBLE_TYPES = {
|
|
23
|
+
DataType.INT64: {DataType.INT32, DataType.INT64},
|
|
24
|
+
DataType.DOUBLE: {DataType.FLOAT, DataType.DOUBLE, DataType.INT32, DataType.INT64},
|
|
25
|
+
DataType.TIMESTAMP_MICROS: {DataType.TIMESTAMP_MILLIS, DataType.TIMESTAMP_MICROS},
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
@classmethod
|
|
29
|
+
def _types_compatible(cls, declared: DataType, actual: DataType) -> bool:
|
|
30
|
+
"""Check if actual type is compatible with declared (allows widening)."""
|
|
31
|
+
if declared == actual:
|
|
32
|
+
return True
|
|
33
|
+
return actual in cls._COMPATIBLE_TYPES.get(declared, set())
|
|
34
|
+
|
|
35
|
+
@classmethod
|
|
36
|
+
def _types_loosely_compatible(cls, declared: DataType, actual: DataType) -> bool:
|
|
37
|
+
"""Check if types are loosely compatible (allows any numeric/timestamp pairing).
|
|
38
|
+
|
|
39
|
+
Lenient mode allows data type mismatches within the same category:
|
|
40
|
+
- All numeric types (INT32, INT64, FLOAT, DOUBLE) are interchangeable
|
|
41
|
+
- All timestamp types are interchangeable
|
|
42
|
+
- Other types must match exactly
|
|
43
|
+
|
|
44
|
+
This fixes legacy FeatureGroups that declare INT32 but return DOUBLE.
|
|
45
|
+
"""
|
|
46
|
+
if declared == actual:
|
|
47
|
+
return True
|
|
48
|
+
|
|
49
|
+
numeric_types = {DataType.INT32, DataType.INT64, DataType.FLOAT, DataType.DOUBLE}
|
|
50
|
+
if declared in numeric_types and actual in numeric_types:
|
|
51
|
+
return True
|
|
52
|
+
|
|
53
|
+
timestamp_types = {DataType.TIMESTAMP_MILLIS, DataType.TIMESTAMP_MICROS}
|
|
54
|
+
if declared in timestamp_types and actual in timestamp_types:
|
|
55
|
+
return True
|
|
56
|
+
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
def validate(cls, data: Any, features: Any, strict_only: bool = False) -> None:
|
|
61
|
+
"""Validate that data columns match declared feature types.
|
|
62
|
+
|
|
63
|
+
Args:
|
|
64
|
+
data: PyArrow table or similar with column data
|
|
65
|
+
features: FeatureSet containing features to validate
|
|
66
|
+
strict_only: If True, only validate when strict_type_enforcement is enabled.
|
|
67
|
+
This maintains backward compatibility with existing code.
|
|
68
|
+
"""
|
|
69
|
+
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
70
|
+
|
|
71
|
+
for feature in features.features:
|
|
72
|
+
if feature.data_type is None:
|
|
73
|
+
continue
|
|
74
|
+
|
|
75
|
+
col_name = feature.get_name()
|
|
76
|
+
if col_name not in data.column_names:
|
|
77
|
+
continue
|
|
78
|
+
|
|
79
|
+
arrow_type = data.schema.field(col_name).type
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
actual_type = DataType.from_arrow_type(arrow_type)
|
|
83
|
+
except ValueError:
|
|
84
|
+
continue
|
|
85
|
+
|
|
86
|
+
strict_mode = False
|
|
87
|
+
if feature.options:
|
|
88
|
+
strict_value = feature.options.get(DefaultOptionKeys.strict_type_enforcement)
|
|
89
|
+
strict_mode = strict_value if strict_value is not None else False
|
|
90
|
+
|
|
91
|
+
if strict_mode:
|
|
92
|
+
if not cls._types_compatible(feature.data_type, actual_type):
|
|
93
|
+
raise DataTypeMismatchError(col_name, feature.data_type, actual_type)
|
|
94
|
+
else:
|
|
95
|
+
if not cls._types_loosely_compatible(feature.data_type, actual_type):
|
|
96
|
+
raise DataTypeMismatchError(col_name, feature.data_type, actual_type)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Any, Optional, Set, TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from mloda.core.abstract_plugins.components.feature import Feature
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FeatureSetValidator:
|
|
8
|
+
@staticmethod
|
|
9
|
+
def validate_options_initialized(options: Any, context: str = "FeatureSet") -> None:
|
|
10
|
+
if options is None:
|
|
11
|
+
raise ValueError(f"Options not initialized in {context}")
|
|
12
|
+
|
|
13
|
+
@staticmethod
|
|
14
|
+
def validate_equal_options(features: Set["Feature"]) -> None:
|
|
15
|
+
if len(features) <= 1:
|
|
16
|
+
return
|
|
17
|
+
|
|
18
|
+
options_list = [feature.options for feature in features]
|
|
19
|
+
first_options = options_list[0]
|
|
20
|
+
|
|
21
|
+
for options in options_list[1:]:
|
|
22
|
+
if options != first_options:
|
|
23
|
+
raise ValueError("Features have different options")
|
|
24
|
+
|
|
25
|
+
@staticmethod
|
|
26
|
+
def validate_feature_added(feature_name: Optional[str], context: str = "feature") -> None:
|
|
27
|
+
if feature_name is None:
|
|
28
|
+
raise ValueError(f"Feature name is None in {context}")
|
|
29
|
+
|
|
30
|
+
@staticmethod
|
|
31
|
+
def validate_filters_not_set(filters: Any) -> None:
|
|
32
|
+
if filters is not None:
|
|
33
|
+
raise ValueError("Filters already set")
|
|
34
|
+
|
|
35
|
+
@staticmethod
|
|
36
|
+
def validate_filters_is_set_type(filters: Any) -> None:
|
|
37
|
+
if not isinstance(filters, set):
|
|
38
|
+
raise ValueError("Filters must be a Set type")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Optional, Set, Type
|
|
2
|
+
from mloda.core.abstract_plugins.compute_framework import ComputeFramework
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FeatureValidator:
|
|
6
|
+
@staticmethod
|
|
7
|
+
def validate_and_resolve_compute_framework(
|
|
8
|
+
framework_name: str, available_frameworks: Set[Type[ComputeFramework]], source: str = "parameter"
|
|
9
|
+
) -> Type[ComputeFramework]:
|
|
10
|
+
for subclass in available_frameworks:
|
|
11
|
+
if framework_name == subclass.get_class_name():
|
|
12
|
+
return subclass
|
|
13
|
+
raise ValueError(f"Compute framework via {source} {framework_name} not found.")
|
|
14
|
+
|
|
15
|
+
@staticmethod
|
|
16
|
+
def validate_compute_frameworks_resolved(
|
|
17
|
+
compute_frameworks: Optional[Set[Type[ComputeFramework]]], feature_name: str
|
|
18
|
+
) -> None:
|
|
19
|
+
if compute_frameworks is None:
|
|
20
|
+
raise ValueError(
|
|
21
|
+
f"Feature {feature_name} does not have any compute framework. "
|
|
22
|
+
"This function can only be called when the frameworks were resolved."
|
|
23
|
+
)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Optional, Set, Tuple, Union
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from mloda.core.abstract_plugins.components.link import Link
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LinkValidator:
|
|
8
|
+
@staticmethod
|
|
9
|
+
def validate_index_not_empty(index: Union[str, Tuple[str, ...]], context: str = "index") -> None:
|
|
10
|
+
if not index:
|
|
11
|
+
raise ValueError(f"{context} cannot be empty")
|
|
12
|
+
|
|
13
|
+
@staticmethod
|
|
14
|
+
def validate_join_type(jointype: Any) -> None:
|
|
15
|
+
from mloda.core.abstract_plugins.components.link import JoinType
|
|
16
|
+
|
|
17
|
+
if not isinstance(jointype, JoinType):
|
|
18
|
+
raise ValueError(f"Join type {jointype} is not supported")
|
|
19
|
+
|
|
20
|
+
@staticmethod
|
|
21
|
+
def validate_no_double_joins(links: Set["Link"]) -> None:
|
|
22
|
+
from mloda.core.abstract_plugins.components.link import JoinType
|
|
23
|
+
|
|
24
|
+
for i_link in links:
|
|
25
|
+
for j_link in links:
|
|
26
|
+
if i_link == j_link:
|
|
27
|
+
continue
|
|
28
|
+
if (
|
|
29
|
+
i_link.left_feature_group == j_link.right_feature_group
|
|
30
|
+
and i_link.right_feature_group == j_link.left_feature_group
|
|
31
|
+
and i_link.jointype not in [JoinType.APPEND, JoinType.UNION]
|
|
32
|
+
):
|
|
33
|
+
raise ValueError(
|
|
34
|
+
f"Link {i_link} and {j_link} have at least two different defined joins. Please remove one."
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def validate_no_conflicting_join_types(links: Set["Link"]) -> None:
|
|
39
|
+
for i_link in links:
|
|
40
|
+
for j_link in links:
|
|
41
|
+
if i_link == j_link:
|
|
42
|
+
continue
|
|
43
|
+
if (
|
|
44
|
+
i_link.left_feature_group == j_link.left_feature_group
|
|
45
|
+
and i_link.right_feature_group == j_link.right_feature_group
|
|
46
|
+
and i_link.jointype != j_link.jointype
|
|
47
|
+
):
|
|
48
|
+
raise ValueError(
|
|
49
|
+
f"Link {i_link} and {j_link} have different join types for the same feature groups. Please remove one."
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
@staticmethod
|
|
53
|
+
def validate_right_join_constraints(links: Set["Link"]) -> None:
|
|
54
|
+
from mloda.core.abstract_plugins.components.link import JoinType
|
|
55
|
+
|
|
56
|
+
for i_link in links:
|
|
57
|
+
if i_link.jointype == JoinType.RIGHT:
|
|
58
|
+
for j_link in links:
|
|
59
|
+
if i_link == j_link:
|
|
60
|
+
continue
|
|
61
|
+
if (
|
|
62
|
+
i_link.left_feature_group == j_link.left_feature_group
|
|
63
|
+
or i_link.left_feature_group == j_link.right_feature_group
|
|
64
|
+
):
|
|
65
|
+
raise ValueError(
|
|
66
|
+
f"Link {i_link} and {j_link} have multiple right joins for the same feature group on the left side or switching from left to right side although using right join. Please reconsider your joinlogic and if possible, use left joins instead of rightjoins. This will currently break the planner or during execution."
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
@classmethod
|
|
70
|
+
def validate_links(cls, links: Optional[Set["Link"]]) -> None:
|
|
71
|
+
if links is None:
|
|
72
|
+
return
|
|
73
|
+
|
|
74
|
+
for link in links:
|
|
75
|
+
cls.validate_join_type(link.jointype)
|
|
76
|
+
|
|
77
|
+
cls.validate_no_double_joins(links)
|
|
78
|
+
cls.validate_no_conflicting_join_types(links)
|
|
79
|
+
cls.validate_right_join_constraints(links)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from typing import Any, Dict, Set
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OptionsValidator:
|
|
5
|
+
"""Validates Options configuration consistency."""
|
|
6
|
+
|
|
7
|
+
@staticmethod
|
|
8
|
+
def validate_no_duplicate_keys(group: Dict[str, Any], context: Dict[str, Any]) -> None:
|
|
9
|
+
"""
|
|
10
|
+
Ensure no key exists in both group and context.
|
|
11
|
+
|
|
12
|
+
Raises ValueError if any key exists in both, with duplicate keys in message.
|
|
13
|
+
"""
|
|
14
|
+
duplicate_keys = set(group.keys()) & set(context.keys())
|
|
15
|
+
if duplicate_keys:
|
|
16
|
+
raise ValueError(f"Keys cannot exist in both group and context: {duplicate_keys}")
|
|
17
|
+
|
|
18
|
+
@staticmethod
|
|
19
|
+
def validate_can_add_to_group(key: str, value: Any, group: Dict[str, Any], context: Dict[str, Any]) -> None:
|
|
20
|
+
"""
|
|
21
|
+
Validate that a key can be added to group.
|
|
22
|
+
|
|
23
|
+
Checks:
|
|
24
|
+
1. If key exists in group with different value -> ValueError (include key in message)
|
|
25
|
+
2. If key exists in context -> ValueError (include key in message)
|
|
26
|
+
"""
|
|
27
|
+
if key in group:
|
|
28
|
+
if value != group[key]:
|
|
29
|
+
raise ValueError(f"Key {key} already exists in group options with a different value: {group[key]}")
|
|
30
|
+
if key in context:
|
|
31
|
+
raise ValueError(f"Key {key} already exists in context options. Cannot add to group.")
|
|
32
|
+
|
|
33
|
+
@staticmethod
|
|
34
|
+
def validate_can_add_to_context(key: str, value: Any, group: Dict[str, Any], context: Dict[str, Any]) -> None:
|
|
35
|
+
"""
|
|
36
|
+
Validate that a key can be added to context.
|
|
37
|
+
|
|
38
|
+
Checks:
|
|
39
|
+
1. If key exists in context with different value -> ValueError (include key in message)
|
|
40
|
+
2. If key exists in group -> ValueError (include key in message)
|
|
41
|
+
"""
|
|
42
|
+
if key in context:
|
|
43
|
+
if value != context[key]:
|
|
44
|
+
raise ValueError(f"Key {key} already exists in context options with a different value: {context[key]}")
|
|
45
|
+
if key in group:
|
|
46
|
+
raise ValueError(f"Key {key} already exists in group options. Cannot add to context.")
|
|
47
|
+
|
|
48
|
+
@staticmethod
|
|
49
|
+
def validate_no_group_context_conflicts(other_group_keys: Set[str], self_context_keys: Set[str]) -> None:
|
|
50
|
+
"""
|
|
51
|
+
Validate no conflicts between other's group keys and self's context keys.
|
|
52
|
+
|
|
53
|
+
Raises ValueError if any key exists in both, with conflicting keys in message.
|
|
54
|
+
"""
|
|
55
|
+
conflicting_keys = other_group_keys & self_context_keys
|
|
56
|
+
if conflicting_keys:
|
|
57
|
+
raise ValueError(f"Cannot update group: keys already exist in context: {conflicting_keys}")
|
mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py
RENAMED
|
@@ -1,23 +1,26 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
from typing import Any, List, Optional, Set, Type, Union, final
|
|
3
3
|
from uuid import UUID, uuid4
|
|
4
|
-
from
|
|
5
|
-
from mloda_core.abstract_plugins.components.framework_transformer.cfw_transformer import (
|
|
4
|
+
from mloda.core.abstract_plugins.components.framework_transformer.cfw_transformer import (
|
|
6
5
|
ComputeFrameworkTransformer,
|
|
7
6
|
)
|
|
8
|
-
from
|
|
7
|
+
from mloda.core.abstract_plugins.components.merge.base_merge_engine import BaseMergeEngine
|
|
9
8
|
import pyarrow as pa
|
|
10
9
|
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
from mloda.core.abstract_plugins.function_extender import (
|
|
11
|
+
Extender,
|
|
12
|
+
ExtenderHook,
|
|
13
|
+
_CompositeExtender,
|
|
14
|
+
)
|
|
15
|
+
from mloda.core.abstract_plugins.components.feature_name import FeatureName
|
|
16
|
+
from mloda.core.abstract_plugins.components.parallelization_modes import ParallelizationMode
|
|
17
|
+
from mloda.core.filter.filter_engine import BaseFilterEngine
|
|
18
|
+
from mloda.core.runtime.flight.flight_server import FlightServer
|
|
16
19
|
|
|
17
20
|
|
|
18
|
-
class
|
|
21
|
+
class ComputeFramework(ABC):
|
|
19
22
|
"""
|
|
20
|
-
Documentation
|
|
23
|
+
Documentation ComputeFramework:
|
|
21
24
|
|
|
22
25
|
This class is used to define the compute framework.
|
|
23
26
|
|
|
@@ -41,10 +44,10 @@ class ComputeFrameWork(ABC):
|
|
|
41
44
|
|
|
42
45
|
def __init__(
|
|
43
46
|
self,
|
|
44
|
-
mode:
|
|
47
|
+
mode: ParallelizationMode,
|
|
45
48
|
children_if_root: frozenset[UUID],
|
|
46
49
|
uuid: UUID = uuid4(),
|
|
47
|
-
function_extender: Optional[Set[
|
|
50
|
+
function_extender: Optional[Set[Extender]] = None,
|
|
48
51
|
) -> None:
|
|
49
52
|
"""This class is initialized step execution."""
|
|
50
53
|
self.mode = mode
|
|
@@ -64,18 +67,19 @@ class ComputeFrameWork(ABC):
|
|
|
64
67
|
# connection object for frameworks that need persistent connections (e.g., DuckDB, Spark)
|
|
65
68
|
self.framework_connection_object: Optional[Any] = None
|
|
66
69
|
|
|
67
|
-
@
|
|
68
|
-
def expected_data_framework() -> Any:
|
|
70
|
+
@classmethod
def expected_data_framework(cls) -> Any:
    """
    Return the data framework type this compute framework expects.

    The base implementation returns ``None``. Per the original note, it only
    needs to be set when the datatype must be guaranteed to be correct.
    """
    return None
|
|
74
77
|
|
|
75
|
-
|
|
78
|
+
@classmethod
def filter_engine(cls) -> Type[BaseFilterEngine]:
    """
    Return the filter engine class for this compute framework.

    The base implementation provides no engine and always raises; a concrete
    ComputeFramework overrides this method if it needs filtering.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()
|
|
81
85
|
|
|
@@ -122,16 +126,15 @@ class ComputeFrameWork(ABC):
|
|
|
122
126
|
"""
|
|
123
127
|
return data
|
|
124
128
|
|
|
125
|
-
|
|
129
|
+
@classmethod
def merge_engine(cls) -> Type[BaseMergeEngine]:
    """
    Return the BaseMergeEngine subclass used to merge data of this compute framework.

    Providing a merge engine is optional. The base implementation has none
    and raises, naming the class it was called on.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    framework_name = cls.__name__
    raise NotImplementedError(f"Merge functionality is for this compute framework not implemented {framework_name}.")
|
|
135
138
|
|
|
136
139
|
def set_framework_connection_object(self, framework_connection_object: Optional[Any] = None) -> None:
|
|
137
140
|
"""
|
|
@@ -241,7 +244,7 @@ class ComputeFrameWork(ABC):
|
|
|
241
244
|
if self.data is None:
|
|
242
245
|
return
|
|
243
246
|
|
|
244
|
-
extender = self.get_function_extender(
|
|
247
|
+
extender = self.get_function_extender(ExtenderHook.VALIDATE_INPUT_FEATURE)
|
|
245
248
|
if extender is None:
|
|
246
249
|
result = feature_group.validate_input_features(self.data, features)
|
|
247
250
|
else:
|
|
@@ -255,7 +258,12 @@ class ComputeFrameWork(ABC):
|
|
|
255
258
|
def run_validate_output_features(self, feature_group: Any, features: Any) -> Any:
|
|
256
259
|
if self.data is None:
|
|
257
260
|
return
|
|
258
|
-
|
|
261
|
+
|
|
262
|
+
from mloda.core.abstract_plugins.components.validators.datatype_validator import DataTypeValidator
|
|
263
|
+
|
|
264
|
+
DataTypeValidator.validate(self.data, features, strict_only=True)
|
|
265
|
+
|
|
266
|
+
extender = self.get_function_extender(ExtenderHook.VALIDATE_OUTPUT_FEATURE)
|
|
259
267
|
if extender is None:
|
|
260
268
|
result = feature_group.validate_output_features(self.data, features)
|
|
261
269
|
else:
|
|
@@ -276,7 +284,7 @@ class ComputeFrameWork(ABC):
|
|
|
276
284
|
|
|
277
285
|
@final
def __eq__(self, other: object) -> bool:
    """
    Compare two compute frameworks by class name and ``children_if_root``.

    Any object that is not a ComputeFramework compares unequal.
    """
    if isinstance(other, ComputeFramework):
        same_class = self.get_class_name() == other.get_class_name()
        # children_if_root is only compared once the class names match.
        return same_class and self.children_if_root == other.children_if_root
    return False
|
|
282
290
|
|
|
@@ -323,23 +331,23 @@ class ComputeFrameWork(ABC):
|
|
|
323
331
|
return False
|
|
324
332
|
|
|
325
333
|
@final
def get_function_extender(self, wrapper_function_enum: ExtenderHook) -> Optional[Extender]:
    """
    Look up the extender(s) registered for the given hook.

    Args:
        wrapper_function_enum: Hook to find among each extender's ``wraps()``.

    Returns:
        ``None`` if no extender in ``self.function_extender`` wraps the hook,
        the extender itself if exactly one does, otherwise a
        ``_CompositeExtender`` built from all matches sorted by ascending
        ``priority``.
    """
    candidates = [ext for ext in self.function_extender if wrapper_function_enum in ext.wraps()]

    if not candidates:
        return None
    if len(candidates) > 1:
        # Stable in-place sort: extenders with equal priority keep their collection order.
        candidates.sort(key=lambda ext: ext.priority)
        return _CompositeExtender(candidates, wrapper_function_enum)
    return candidates[0]
|
|
339
347
|
|
|
340
348
|
@final
|
|
341
349
|
def run_calculate_feature(self, feature_group: Any, features: Any) -> Any:
|
|
342
|
-
extender = self.get_function_extender(
|
|
350
|
+
extender = self.get_function_extender(ExtenderHook.FEATURE_GROUP_CALCULATE_FEATURE)
|
|
343
351
|
|
|
344
352
|
try:
|
|
345
353
|
if extender is None:
|
|
@@ -365,8 +373,8 @@ When a feature depends on multiple input features, you must provide explicit Lin
|
|
|
365
373
|
how to merge them. Without Links, the framework cannot determine how to combine the data.
|
|
366
374
|
|
|
367
375
|
Example:
|
|
368
|
-
from
|
|
369
|
-
from
|
|
376
|
+
from mloda.core.abstract_plugins.components.link import Link
|
|
377
|
+
from mloda.core.abstract_plugins.components.index.index import Index
|
|
370
378
|
|
|
371
379
|
links = {{
|
|
372
380
|
Link.inner(
|