mloda 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -46
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
- mloda-0.4.0.dist-info/RECORD +248 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.3.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from typing import Set, Type
|
|
2
2
|
|
|
3
|
-
from
|
|
3
|
+
from mloda.core.abstract_plugins.feature_group import FeatureGroup
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
class
|
|
6
|
+
class PluginCollector:
|
|
7
7
|
"""
|
|
8
|
-
The
|
|
8
|
+
The PluginCollector class is a helper class with the purpose to disable or enable feature groups.
|
|
9
9
|
|
|
10
10
|
This class is useful for rapid prototype development, where you want to disable or enable feature groups,
|
|
11
11
|
when the other, competing feature groups are found.
|
|
@@ -15,16 +15,16 @@ class PlugInCollector:
|
|
|
15
15
|
"""
|
|
16
16
|
|
|
17
17
|
def __init__(self) -> None:
|
|
18
|
-
self.disabled_feature_group_classes: Set[Type[
|
|
19
|
-
self.enabled_feature_group_classes: Set[Type[
|
|
18
|
+
self.disabled_feature_group_classes: Set[Type[FeatureGroup]] = set()
|
|
19
|
+
self.enabled_feature_group_classes: Set[Type[FeatureGroup]] = set()
|
|
20
20
|
|
|
21
|
-
def add_disabled_feature_group_classes(self, feature_group_cls: Set[Type[
|
|
21
|
+
def add_disabled_feature_group_classes(self, feature_group_cls: Set[Type[FeatureGroup]]) -> None:
|
|
22
22
|
self.disabled_feature_group_classes.update(feature_group_cls)
|
|
23
23
|
|
|
24
|
-
def add_enabled_feature_group_classes(self, feature_group_cls: Set[Type[
|
|
24
|
+
def add_enabled_feature_group_classes(self, feature_group_cls: Set[Type[FeatureGroup]]) -> None:
|
|
25
25
|
self.enabled_feature_group_classes.update(feature_group_cls)
|
|
26
26
|
|
|
27
|
-
def applicable_feature_group_class(self, feature_group_cls: Type[
|
|
27
|
+
def applicable_feature_group_class(self, feature_group_cls: Type[FeatureGroup]) -> bool:
|
|
28
28
|
if feature_group_cls in self.disabled_feature_group_classes:
|
|
29
29
|
return False
|
|
30
30
|
|
|
@@ -38,22 +38,22 @@ class PlugInCollector:
|
|
|
38
38
|
|
|
39
39
|
@staticmethod
|
|
40
40
|
def disabled_feature_groups(
|
|
41
|
-
feature_group_cls: Set[Type[
|
|
42
|
-
) -> "
|
|
41
|
+
feature_group_cls: Set[Type[FeatureGroup]] | Type[FeatureGroup],
|
|
42
|
+
) -> "PluginCollector":
|
|
43
43
|
if not isinstance(feature_group_cls, Set):
|
|
44
44
|
feature_group_cls = {feature_group_cls}
|
|
45
45
|
|
|
46
|
-
plugin_collector =
|
|
46
|
+
plugin_collector = PluginCollector()
|
|
47
47
|
plugin_collector.add_disabled_feature_group_classes(feature_group_cls)
|
|
48
48
|
return plugin_collector
|
|
49
49
|
|
|
50
50
|
@staticmethod
|
|
51
51
|
def enabled_feature_groups(
|
|
52
|
-
feature_group_cls: Set[Type[
|
|
53
|
-
) -> "
|
|
52
|
+
feature_group_cls: Set[Type[FeatureGroup]] | Type[FeatureGroup],
|
|
53
|
+
) -> "PluginCollector":
|
|
54
54
|
if not isinstance(feature_group_cls, Set):
|
|
55
55
|
feature_group_cls = {feature_group_cls}
|
|
56
56
|
|
|
57
|
-
plugin_collector =
|
|
57
|
+
plugin_collector = PluginCollector()
|
|
58
58
|
plugin_collector.add_enabled_feature_group_classes(feature_group_cls)
|
|
59
59
|
return plugin_collector
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from mloda.core.abstract_plugins.components.data_types import DataType
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DataTypeMismatchError(ValueError):
    """Raised when feature data type doesn't match declared type.

    The offending feature name and both types are stored on the instance so
    callers can inspect the mismatch without parsing the message text.
    """

    def __init__(self, feature_name: str, declared: DataType, actual: DataType) -> None:
        """Build the error.

        Args:
            feature_name: Name of the feature whose data did not match.
            declared: The DataType the feature declared.
            actual: The DataType that was found instead.
        """
        self.feature_name = feature_name
        self.declared = declared
        self.actual = actual
        super().__init__(
            f"Feature '{feature_name}': declared {declared.name}, got {actual.name}, coercion not supported"
        )
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataTypeValidator:
    """Validates feature data matches declared DataType."""

    # Strict mode: declared type -> actual types accepted for it (widening only).
    _COMPATIBLE_TYPES = {
        DataType.INT64: {DataType.INT32, DataType.INT64},
        DataType.DOUBLE: {DataType.FLOAT, DataType.DOUBLE, DataType.INT32, DataType.INT64},
        DataType.TIMESTAMP_MICROS: {DataType.TIMESTAMP_MILLIS, DataType.TIMESTAMP_MICROS},
    }

    # Lenient mode: members of the same category are interchangeable.
    _NUMERIC_TYPES = frozenset({DataType.INT32, DataType.INT64, DataType.FLOAT, DataType.DOUBLE})
    _TIMESTAMP_TYPES = frozenset({DataType.TIMESTAMP_MILLIS, DataType.TIMESTAMP_MICROS})

    @classmethod
    def _types_compatible(cls, declared: DataType, actual: DataType) -> bool:
        """Check if actual type is compatible with declared (allows widening)."""
        if declared == actual:
            return True
        return actual in cls._COMPATIBLE_TYPES.get(declared, set())

    @classmethod
    def _types_loosely_compatible(cls, declared: DataType, actual: DataType) -> bool:
        """Check if types are loosely compatible (allows any numeric/timestamp pairing).

        Lenient mode allows data type mismatches within the same category:
        - All numeric types (INT32, INT64, FLOAT, DOUBLE) are interchangeable
        - All timestamp types are interchangeable
        - Other types must match exactly

        This tolerates legacy FeatureGroups that declare INT32 but return DOUBLE.
        """
        if declared == actual:
            return True

        if declared in cls._NUMERIC_TYPES and actual in cls._NUMERIC_TYPES:
            return True

        if declared in cls._TIMESTAMP_TYPES and actual in cls._TIMESTAMP_TYPES:
            return True

        return False

    @classmethod
    def validate(cls, data: Any, features: Any, strict_only: bool = False) -> None:
        """Validate that data columns match declared feature types.

        Features are skipped when they declare no data type, when their column
        is absent from ``data``, or when the column's arrow type cannot be
        mapped to a DataType.

        Args:
            data: PyArrow table or similar with column data
            features: FeatureSet containing features to validate
            strict_only: If True, only validate when strict_type_enforcement is enabled.
                This maintains backward compatibility with existing code.

        Raises:
            DataTypeMismatchError: If a checked column's type is not compatible
                with the feature's declared type.
        """
        # NOTE(review): imported at call time; presumably to avoid an import cycle
        # between core and plugins - confirm before moving to module level.
        from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys

        for feature in features.features:
            if feature.data_type is None:
                continue

            col_name = feature.get_name()
            if col_name not in data.column_names:
                continue

            arrow_type = data.schema.field(col_name).type

            try:
                actual_type = DataType.from_arrow_type(arrow_type)
            except ValueError:
                # Arrow type without a DataType counterpart: nothing to compare against.
                continue

            strict_mode = False
            if feature.options:
                strict_value = feature.options.get(DefaultOptionKeys.strict_type_enforcement)
                strict_mode = strict_value if strict_value is not None else False

            if strict_mode:
                compatible = cls._types_compatible(feature.data_type, actual_type)
            elif strict_only:
                # Caller asked for strict-only validation and this feature did not
                # opt in, so it is left unchecked as the docstring promises.
                continue
            else:
                compatible = cls._types_loosely_compatible(feature.data_type, actual_type)

            if not compatible:
                raise DataTypeMismatchError(col_name, feature.data_type, actual_type)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from typing import Any, Optional, Set, TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from mloda.core.abstract_plugins.components.feature import Feature
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FeatureSetValidator:
    """Stateless validation helpers.

    Every check returns None on success and raises ValueError on failure.
    """

    @staticmethod
    def validate_options_initialized(options: Any, context: str = "FeatureSet") -> None:
        """Raise ValueError if ``options`` is None; ``context`` names the owner in the message."""
        if options is None:
            raise ValueError(f"Options not initialized in {context}")

    @staticmethod
    def validate_equal_options(features: Set["Feature"]) -> None:
        """Raise ValueError unless all features carry equal ``options``.

        Collections with zero or one feature pass trivially.
        """
        if len(features) <= 1:
            return

        remaining = iter(features)
        # The first feature yielded serves as the reference for all others.
        first_options = next(remaining).options

        for feature in remaining:
            if feature.options != first_options:
                raise ValueError("Features have different options")

    @staticmethod
    def validate_feature_added(feature_name: Optional[str], context: str = "feature") -> None:
        """Raise ValueError if ``feature_name`` is None; ``context`` is echoed in the message."""
        if feature_name is None:
            raise ValueError(f"Feature name is None in {context}")

    @staticmethod
    def validate_filters_not_set(filters: Any) -> None:
        """Raise ValueError if ``filters`` is anything other than None."""
        if filters is not None:
            raise ValueError("Filters already set")

    @staticmethod
    def validate_filters_is_set_type(filters: Any) -> None:
        """Raise ValueError unless ``filters`` is an instance of the builtin ``set``."""
        if not isinstance(filters, set):
            raise ValueError("Filters must be a Set type")
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from typing import Optional, Set, Type
|
|
2
|
+
from mloda.core.abstract_plugins.compute_framework import ComputeFramework
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class FeatureValidator:
    """Stateless checks around a feature's compute frameworks."""

    @staticmethod
    def validate_and_resolve_compute_framework(
        framework_name: str, available_frameworks: Set[Type[ComputeFramework]], source: str = "parameter"
    ) -> Type[ComputeFramework]:
        """Return the framework class whose ``get_class_name()`` equals ``framework_name``.

        Args:
            framework_name: Class name to look up.
            available_frameworks: Candidate framework classes to search.
            source: Where the name came from; only used in the error message.

        Raises:
            ValueError: If no candidate reports that class name.
        """
        for subclass in available_frameworks:
            if framework_name == subclass.get_class_name():
                return subclass
        raise ValueError(f"Compute framework via {source} {framework_name} not found.")

    @staticmethod
    def validate_compute_frameworks_resolved(
        compute_frameworks: Optional[Set[Type[ComputeFramework]]], feature_name: str
    ) -> None:
        """Raise ValueError if ``compute_frameworks`` is None.

        An empty set passes; only None is rejected.
        """
        if compute_frameworks is None:
            raise ValueError(
                f"Feature {feature_name} does not have any compute framework. "
                "This function can only be called when the frameworks were resolved."
            )
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING, Any, Optional, Set, Tuple, Union
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from mloda.core.abstract_plugins.components.link import Link
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class LinkValidator:
    """Stateless consistency checks for links and collections of links.

    Every check returns None on success and raises ValueError on failure.
    """

    @staticmethod
    def validate_index_not_empty(index: Union[str, Tuple[str, ...]], context: str = "index") -> None:
        """Raise ValueError if ``index`` is falsy (empty string or empty tuple)."""
        if not index:
            raise ValueError(f"{context} cannot be empty")

    @staticmethod
    def validate_join_type(jointype: Any) -> None:
        """Raise ValueError unless ``jointype`` is a ``JoinType`` instance."""
        # NOTE(review): imported at call time; presumably to avoid a circular
        # import with the link module - confirm before hoisting.
        from mloda.core.abstract_plugins.components.link import JoinType

        if not isinstance(jointype, JoinType):
            raise ValueError(f"Join type {jointype} is not supported")

    @staticmethod
    def validate_no_double_joins(links: Set["Link"]) -> None:
        """Reject two links that join the same pair of feature groups in mirrored order.

        A pair is tolerated when the link under inspection is an APPEND or UNION;
        because every ordered pair is visited, either link can trigger the error.
        """
        from mloda.core.abstract_plugins.components.link import JoinType

        for i_link in links:
            for j_link in links:
                if i_link == j_link:
                    continue
                if (
                    i_link.left_feature_group == j_link.right_feature_group
                    and i_link.right_feature_group == j_link.left_feature_group
                    and i_link.jointype not in (JoinType.APPEND, JoinType.UNION)
                ):
                    raise ValueError(
                        f"Link {i_link} and {j_link} have at least two different defined joins. Please remove one."
                    )

    @staticmethod
    def validate_no_conflicting_join_types(links: Set["Link"]) -> None:
        """Reject two links with identical left/right feature groups but different join types."""
        for i_link in links:
            for j_link in links:
                if i_link == j_link:
                    continue
                if (
                    i_link.left_feature_group == j_link.left_feature_group
                    and i_link.right_feature_group == j_link.right_feature_group
                    and i_link.jointype != j_link.jointype
                ):
                    raise ValueError(
                        f"Link {i_link} and {j_link} have different join types for the same feature groups. Please remove one."
                    )

    @staticmethod
    def validate_right_join_constraints(links: Set["Link"]) -> None:
        """Reject a RIGHT join whose left feature group appears on either side of another link."""
        from mloda.core.abstract_plugins.components.link import JoinType

        for i_link in links:
            if i_link.jointype != JoinType.RIGHT:
                continue
            for j_link in links:
                if i_link == j_link:
                    continue
                if (
                    i_link.left_feature_group == j_link.left_feature_group
                    or i_link.left_feature_group == j_link.right_feature_group
                ):
                    raise ValueError(
                        f"Link {i_link} and {j_link} have multiple right joins for the same feature group on the left side or switching from left to right side although using right join. Please reconsider your joinlogic and if possible, use left joins instead of rightjoins. This will currently break the planner or during execution."
                    )

    @classmethod
    def validate_links(cls, links: Optional[Set["Link"]]) -> None:
        """Run every link check; ``None`` means there is nothing to validate.

        Join types are checked per link first, then the pairwise checks run in
        order: double joins, conflicting join types, right-join constraints.
        """
        if links is None:
            return

        for link in links:
            cls.validate_join_type(link.jointype)

        cls.validate_no_double_joins(links)
        cls.validate_no_conflicting_join_types(links)
        cls.validate_right_join_constraints(links)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from typing import Any, Dict, Set
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class OptionsValidator:
    """Validates Options configuration consistency."""

    @staticmethod
    def validate_no_duplicate_keys(group: Dict[str, Any], context: Dict[str, Any]) -> None:
        """
        Ensure no key exists in both group and context.

        Raises ValueError if any key exists in both, with duplicate keys in message.
        """
        # dict key views support set intersection directly and yield a plain set.
        duplicate_keys = group.keys() & context.keys()
        if duplicate_keys:
            raise ValueError(f"Keys cannot exist in both group and context: {duplicate_keys}")

    @staticmethod
    def validate_can_add_to_group(key: str, value: Any, group: Dict[str, Any], context: Dict[str, Any]) -> None:
        """
        Validate that a key can be added to group.

        Checks:
        1. If key exists in group with different value -> ValueError (include key in message)
        2. If key exists in context -> ValueError (include key in message)

        Re-adding a key with the value it already has in group is accepted.
        """
        if key in group and value != group[key]:
            raise ValueError(f"Key {key} already exists in group options with a different value: {group[key]}")
        if key in context:
            raise ValueError(f"Key {key} already exists in context options. Cannot add to group.")

    @staticmethod
    def validate_can_add_to_context(key: str, value: Any, group: Dict[str, Any], context: Dict[str, Any]) -> None:
        """
        Validate that a key can be added to context.

        Checks:
        1. If key exists in context with different value -> ValueError (include key in message)
        2. If key exists in group -> ValueError (include key in message)

        Re-adding a key with the value it already has in context is accepted.
        """
        if key in context and value != context[key]:
            raise ValueError(f"Key {key} already exists in context options with a different value: {context[key]}")
        if key in group:
            raise ValueError(f"Key {key} already exists in group options. Cannot add to context.")

    @staticmethod
    def validate_no_group_context_conflicts(other_group_keys: Set[str], self_context_keys: Set[str]) -> None:
        """
        Validate no conflicts between other's group keys and self's context keys.

        Raises ValueError if any key exists in both, with conflicting keys in message.
        """
        conflicting_keys = other_group_keys & self_context_keys
        if conflicting_keys:
            raise ValueError(f"Cannot update group: keys already exist in context: {conflicting_keys}")
|
mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py
RENAMED
|
@@ -1,23 +1,27 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
from typing import Any, List, Optional, Set, Type, Union, final
|
|
3
3
|
from uuid import UUID, uuid4
|
|
4
|
-
from
|
|
5
|
-
from
|
|
4
|
+
from mloda.core.abstract_plugins.components.data_access_collection import DataAccessCollection
|
|
5
|
+
from mloda.core.abstract_plugins.components.framework_transformer.cfw_transformer import (
|
|
6
6
|
ComputeFrameworkTransformer,
|
|
7
7
|
)
|
|
8
|
-
from
|
|
8
|
+
from mloda.core.abstract_plugins.components.merge.base_merge_engine import BaseMergeEngine
|
|
9
9
|
import pyarrow as pa
|
|
10
10
|
|
|
11
|
-
from
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
11
|
+
from mloda.core.abstract_plugins.function_extender import (
|
|
12
|
+
Extender,
|
|
13
|
+
ExtenderHook,
|
|
14
|
+
_CompositeExtender,
|
|
15
|
+
)
|
|
16
|
+
from mloda.core.abstract_plugins.components.feature_name import FeatureName
|
|
17
|
+
from mloda.core.abstract_plugins.components.parallelization_modes import ParallelizationMode
|
|
18
|
+
from mloda.core.filter.filter_engine import BaseFilterEngine
|
|
19
|
+
from mloda.core.runtime.flight.flight_server import FlightServer
|
|
16
20
|
|
|
17
21
|
|
|
18
|
-
class
|
|
22
|
+
class ComputeFramework(ABC):
|
|
19
23
|
"""
|
|
20
|
-
Documentation
|
|
24
|
+
Documentation ComputeFramework:
|
|
21
25
|
|
|
22
26
|
This class is used to define the compute framework.
|
|
23
27
|
|
|
@@ -41,10 +45,10 @@ class ComputeFrameWork(ABC):
|
|
|
41
45
|
|
|
42
46
|
def __init__(
|
|
43
47
|
self,
|
|
44
|
-
mode:
|
|
48
|
+
mode: ParallelizationMode,
|
|
45
49
|
children_if_root: frozenset[UUID],
|
|
46
50
|
uuid: UUID = uuid4(),
|
|
47
|
-
function_extender: Optional[Set[
|
|
51
|
+
function_extender: Optional[Set[Extender]] = None,
|
|
48
52
|
) -> None:
|
|
49
53
|
"""This class is initialized step execution."""
|
|
50
54
|
self.mode = mode
|
|
@@ -64,18 +68,19 @@ class ComputeFrameWork(ABC):
|
|
|
64
68
|
# connection object for frameworks that need persistent connections (e.g., DuckDB, Spark)
|
|
65
69
|
self.framework_connection_object: Optional[Any] = None
|
|
66
70
|
|
|
67
|
-
@
|
|
68
|
-
def expected_data_framework() -> Any:
|
|
71
|
+
@classmethod
def expected_data_framework(cls) -> Any:
    """
    Return the data framework expected by this compute framework.

    The base implementation returns None. It only needs to be set when the
    datatype really has to be verified as correct.
    """
    return None
|
|
74
78
|
|
|
75
|
-
|
|
79
|
+
@classmethod
def filter_engine(cls) -> Type[BaseFilterEngine]:
    """
    Return the BaseFilterEngine subclass belonging to this compute framework.

    The base implementation provides no engine and raises NotImplementedError;
    the appropriate ComputeFramework should override this method if needed.
    """
    raise NotImplementedError
|
|
81
86
|
|
|
@@ -122,16 +127,15 @@ class ComputeFrameWork(ABC):
|
|
|
122
127
|
"""
|
|
123
128
|
return data
|
|
124
129
|
|
|
125
|
-
|
|
130
|
+
@classmethod
def merge_engine(cls) -> Type[BaseMergeEngine]:
    """
    Return the BaseMergeEngine subclass used to merge data from the same compute framework.

    Providing a merge engine is optional. The base implementation raises
    NotImplementedError and names the class it was called on.
    """
    message = f"Merge functionality is for this compute framework not implemented {cls.__name__}."
    raise NotImplementedError(message)
|
|
135
139
|
|
|
136
140
|
def set_framework_connection_object(self, framework_connection_object: Optional[Any] = None) -> None:
|
|
137
141
|
"""
|
|
@@ -241,7 +245,7 @@ class ComputeFrameWork(ABC):
|
|
|
241
245
|
if self.data is None:
|
|
242
246
|
return
|
|
243
247
|
|
|
244
|
-
extender = self.get_function_extender(
|
|
248
|
+
extender = self.get_function_extender(ExtenderHook.VALIDATE_INPUT_FEATURE)
|
|
245
249
|
if extender is None:
|
|
246
250
|
result = feature_group.validate_input_features(self.data, features)
|
|
247
251
|
else:
|
|
@@ -255,7 +259,12 @@ class ComputeFrameWork(ABC):
|
|
|
255
259
|
def run_validate_output_features(self, feature_group: Any, features: Any) -> Any:
|
|
256
260
|
if self.data is None:
|
|
257
261
|
return
|
|
258
|
-
|
|
262
|
+
|
|
263
|
+
from mloda.core.abstract_plugins.components.validators.datatype_validator import DataTypeValidator
|
|
264
|
+
|
|
265
|
+
DataTypeValidator.validate(self.data, features, strict_only=True)
|
|
266
|
+
|
|
267
|
+
extender = self.get_function_extender(ExtenderHook.VALIDATE_OUTPUT_FEATURE)
|
|
259
268
|
if extender is None:
|
|
260
269
|
result = feature_group.validate_output_features(self.data, features)
|
|
261
270
|
else:
|
|
@@ -276,7 +285,7 @@ class ComputeFrameWork(ABC):
|
|
|
276
285
|
|
|
277
286
|
@final
def __eq__(self, other: object) -> bool:
    """
    Compare two compute frameworks.

    Objects that are not ComputeFramework instances are never equal. Otherwise
    both the class name and ``children_if_root`` must match.
    """
    if isinstance(other, ComputeFramework):
        same_class = self.get_class_name() == other.get_class_name()
        return same_class and self.children_if_root == other.children_if_root
    return False
|
|
282
291
|
|
|
@@ -323,23 +332,23 @@ class ComputeFrameWork(ABC):
|
|
|
323
332
|
return False
|
|
324
333
|
|
|
325
334
|
@final
def get_function_extender(self, wrapper_function_enum: ExtenderHook) -> Optional[Extender]:
    """
    Select the extender to use for the given hook.

    Returns:
        None when no extender in ``self.function_extender`` wraps the hook,
        the extender itself when exactly one does, and otherwise a
        ``_CompositeExtender`` built from the matching extenders ordered by
        ascending ``priority``.
    """
    candidates = [ext for ext in self.function_extender if wrapper_function_enum in ext.wraps()]

    if not candidates:
        return None
    if len(candidates) == 1:
        return candidates[0]

    # Several extenders wrap this hook: order them by priority before combining.
    candidates.sort(key=lambda ext: ext.priority)
    return _CompositeExtender(candidates, wrapper_function_enum)
|
|
339
348
|
|
|
340
349
|
@final
|
|
341
350
|
def run_calculate_feature(self, feature_group: Any, features: Any) -> Any:
|
|
342
|
-
extender = self.get_function_extender(
|
|
351
|
+
extender = self.get_function_extender(ExtenderHook.FEATURE_GROUP_CALCULATE_FEATURE)
|
|
343
352
|
|
|
344
353
|
try:
|
|
345
354
|
if extender is None:
|
|
@@ -365,8 +374,8 @@ When a feature depends on multiple input features, you must provide explicit Lin
|
|
|
365
374
|
how to merge them. Without Links, the framework cannot determine how to combine the data.
|
|
366
375
|
|
|
367
376
|
Example:
|
|
368
|
-
from
|
|
369
|
-
from
|
|
377
|
+
from mloda.core.abstract_plugins.components.link import Link
|
|
378
|
+
from mloda.core.abstract_plugins.components.index.index import Index
|
|
370
379
|
|
|
371
380
|
links = {{
|
|
372
381
|
Link.inner(
|