mloda 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +45 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -47
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +15 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/METADATA +24 -31
- mloda-0.4.1.dist-info/RECORD +248 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -11
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +2 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -64
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +67 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -82
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +81 -96
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +108 -106
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +52 -44
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -3
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -74
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +53 -53
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +3 -4
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -60
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -3
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -63
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +108 -95
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.3.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/WHEEL +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Any, Set, Type, Optional
|
|
3
|
-
from
|
|
3
|
+
from mloda.provider import BaseMergeEngine
|
|
4
4
|
from mloda_plugins.compute_framework.base_implementations.duckdb.duckdb_merge_engine import DuckDBMergeEngine
|
|
5
|
-
from
|
|
6
|
-
from
|
|
7
|
-
from
|
|
5
|
+
from mloda.user import FeatureName
|
|
6
|
+
from mloda import ComputeFramework
|
|
7
|
+
from mloda.provider import BaseFilterEngine
|
|
8
8
|
from mloda_plugins.compute_framework.base_implementations.duckdb.duckdb_filter_engine import DuckDBFilterEngine
|
|
9
9
|
|
|
10
10
|
try:
|
|
@@ -15,8 +15,8 @@ except ImportError:
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
class DuckDBFramework(
|
|
19
|
-
"""DuckDB framework implementation for
|
|
18
|
+
class DuckDBFramework(ComputeFramework):
|
|
19
|
+
"""DuckDB framework implementation for ComputeFramework.
|
|
20
20
|
|
|
21
21
|
This framework does not support multiprocessing, so it should not be used with multiprocessing.
|
|
22
22
|
"""
|
|
@@ -43,11 +43,12 @@ class DuckDBFramework(ComputeFrameWork):
|
|
|
43
43
|
except ImportError:
|
|
44
44
|
return False
|
|
45
45
|
|
|
46
|
-
@
|
|
47
|
-
def expected_data_framework() -> Any:
|
|
48
|
-
return
|
|
46
|
+
@classmethod
|
|
47
|
+
def expected_data_framework(cls) -> Any:
|
|
48
|
+
return cls.duckdb_relation()
|
|
49
49
|
|
|
50
|
-
|
|
50
|
+
@classmethod
|
|
51
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
51
52
|
return DuckDBMergeEngine
|
|
52
53
|
|
|
53
54
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
|
@@ -61,8 +62,8 @@ class DuckDBFramework(ComputeFrameWork):
|
|
|
61
62
|
def set_column_names(self) -> None:
|
|
62
63
|
self.column_names = set(self.data.columns)
|
|
63
64
|
|
|
64
|
-
@
|
|
65
|
-
def duckdb_relation() -> Any:
|
|
65
|
+
@classmethod
|
|
66
|
+
def duckdb_relation(cls) -> Any:
|
|
66
67
|
if duckdb is None:
|
|
67
68
|
raise ImportError("DuckDB is not installed. To be able to use this framework, please install duckdb.")
|
|
68
69
|
return duckdb.DuckDBPyRelation
|
|
@@ -117,5 +118,6 @@ class DuckDBFramework(ComputeFrameWork):
|
|
|
117
118
|
|
|
118
119
|
raise ValueError(f"Data {type(data)} is not supported by {self.__class__.__name__}")
|
|
119
120
|
|
|
120
|
-
|
|
121
|
+
@classmethod
|
|
122
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
121
123
|
return DuckDBFilterEngine
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from mloda.user import Index
|
|
4
|
+
from mloda.user import JoinType
|
|
5
|
+
from mloda.provider import BaseMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
import duckdb
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Any, Optional, Type
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from mloda.provider import BaseFilterEngine
|
|
3
|
+
from mloda.user import SingleFilter
|
|
4
4
|
|
|
5
5
|
try:
|
|
6
6
|
from pyiceberg.table import Table as IcebergTable
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import Any, Set, Type, Optional
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
|
+
from mloda.user import FeatureName
|
|
4
|
+
from mloda import ComputeFramework
|
|
5
|
+
from mloda.provider import BaseFilterEngine
|
|
6
6
|
from mloda_plugins.compute_framework.base_implementations.iceberg.iceberg_filter_engine import IcebergFilterEngine
|
|
7
7
|
|
|
8
8
|
try:
|
|
@@ -15,7 +15,7 @@ except ImportError:
|
|
|
15
15
|
pa = None
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
class IcebergFramework(
|
|
18
|
+
class IcebergFramework(ComputeFramework):
|
|
19
19
|
"""
|
|
20
20
|
Iceberg compute framework implementation.
|
|
21
21
|
|
|
@@ -60,17 +60,18 @@ class IcebergFramework(ComputeFrameWork):
|
|
|
60
60
|
except ImportError:
|
|
61
61
|
return False
|
|
62
62
|
|
|
63
|
-
@
|
|
64
|
-
def expected_data_framework() -> Any:
|
|
63
|
+
@classmethod
|
|
64
|
+
def expected_data_framework(cls) -> Any:
|
|
65
65
|
"""Return the expected Iceberg table type."""
|
|
66
66
|
if IcebergTable is None:
|
|
67
67
|
raise ImportError("PyIceberg is not installed. To use this framework, please install pyiceberg.")
|
|
68
68
|
return IcebergTable
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
@classmethod
|
|
71
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
71
72
|
"""Iceberg tables don't support direct merging in this framework context."""
|
|
72
73
|
raise NotImplementedError(
|
|
73
|
-
f"Merge functionality is not implemented for {
|
|
74
|
+
f"Merge functionality is not implemented for {cls.__name__}. "
|
|
74
75
|
"Iceberg tables are typically used for data lake scenarios where merging "
|
|
75
76
|
"is handled at the catalog/table/engine level, not at the compute framework level."
|
|
76
77
|
)
|
|
@@ -163,6 +164,7 @@ class IcebergFramework(ComputeFrameWork):
|
|
|
163
164
|
|
|
164
165
|
raise ValueError(f"Data type {type(self.data)} is not supported by {self.__class__.__name__}")
|
|
165
166
|
|
|
166
|
-
|
|
167
|
+
@classmethod
|
|
168
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
167
169
|
"""Return the Iceberg filter engine."""
|
|
168
170
|
return IcebergFilterEngine
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from typing import Any, Set, Type
|
|
2
|
-
from
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
3
|
from mloda_plugins.compute_framework.base_implementations.pandas.pandas_merge_engine import PandasMergeEngine
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
from
|
|
4
|
+
from mloda.user import FeatureName
|
|
5
|
+
from mloda import ComputeFramework
|
|
6
|
+
from mloda.provider import BaseFilterEngine
|
|
7
7
|
from mloda_plugins.compute_framework.base_implementations.pandas.pandas_filter_engine import PandasFilterEngine
|
|
8
8
|
|
|
9
9
|
try:
|
|
@@ -12,7 +12,7 @@ except ImportError:
|
|
|
12
12
|
pd = None
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
class PandasDataFrame(
|
|
15
|
+
class PandasDataFrame(ComputeFramework):
|
|
16
16
|
@staticmethod
|
|
17
17
|
def is_available() -> bool:
|
|
18
18
|
"""Check if Pandas is installed and available."""
|
|
@@ -23,11 +23,12 @@ class PandasDataFrame(ComputeFrameWork):
|
|
|
23
23
|
except ImportError:
|
|
24
24
|
return False
|
|
25
25
|
|
|
26
|
-
@
|
|
27
|
-
def expected_data_framework() -> Any:
|
|
28
|
-
return
|
|
26
|
+
@classmethod
|
|
27
|
+
def expected_data_framework(cls) -> Any:
|
|
28
|
+
return cls.pd_dataframe()
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
@classmethod
|
|
31
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
31
32
|
return PandasMergeEngine
|
|
32
33
|
|
|
33
34
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
|
@@ -38,20 +39,20 @@ class PandasDataFrame(ComputeFrameWork):
|
|
|
38
39
|
def set_column_names(self) -> None:
|
|
39
40
|
self.column_names = set(self.data.columns)
|
|
40
41
|
|
|
41
|
-
@
|
|
42
|
-
def pd_dataframe() -> Any:
|
|
42
|
+
@classmethod
|
|
43
|
+
def pd_dataframe(cls) -> Any:
|
|
43
44
|
if pd is None:
|
|
44
45
|
raise ImportError("Pandas is not installed. To be able to use this framework, please install pandas.")
|
|
45
46
|
return pd.DataFrame
|
|
46
47
|
|
|
47
|
-
@
|
|
48
|
-
def pd_series() -> Any:
|
|
48
|
+
@classmethod
|
|
49
|
+
def pd_series(cls) -> Any:
|
|
49
50
|
if pd is None:
|
|
50
51
|
raise ImportError("Pandas is not installed. To be able to use this framework, please install pandas.")
|
|
51
52
|
return pd.Series
|
|
52
53
|
|
|
53
|
-
@
|
|
54
|
-
def pd_merge() -> Any:
|
|
54
|
+
@classmethod
|
|
55
|
+
def pd_merge(cls) -> Any:
|
|
55
56
|
if pd is None:
|
|
56
57
|
raise ImportError("Pandas is not installed. To be able to use this framework, please install pandas.")
|
|
57
58
|
return pd.merge
|
|
@@ -83,5 +84,6 @@ class PandasDataFrame(ComputeFrameWork):
|
|
|
83
84
|
|
|
84
85
|
raise ValueError(f"Data {type(data)} is not supported by {self.__class__.__name__}")
|
|
85
86
|
|
|
86
|
-
|
|
87
|
+
@classmethod
|
|
88
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
87
89
|
return PandasFilterEngine
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Any
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from mloda.provider import BaseFilterEngine
|
|
3
|
+
from mloda.user import SingleFilter
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
class PandasFilterEngine(BaseFilterEngine):
|
|
@@ -23,11 +23,19 @@ class PandasFilterEngine(BaseFilterEngine):
|
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
25
|
def do_min_filter(cls, data: Any, filter_feature: SingleFilter) -> Any:
|
|
26
|
-
|
|
26
|
+
value = filter_feature.parameter.value
|
|
27
|
+
if value is None:
|
|
28
|
+
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
29
|
+
return data[data[filter_feature.name] >= value]
|
|
27
30
|
|
|
28
31
|
@classmethod
|
|
29
32
|
def do_max_filter(cls, data: Any, filter_feature: SingleFilter) -> Any:
|
|
30
|
-
if
|
|
33
|
+
# Check if this is a complex parameter with max/max_exclusive or a simple one with value
|
|
34
|
+
has_max = filter_feature.parameter.max_value is not None
|
|
35
|
+
has_value = filter_feature.parameter.value is not None
|
|
36
|
+
|
|
37
|
+
if has_max:
|
|
38
|
+
# Complex parameter - use get_min_max_operator
|
|
31
39
|
min_parameter, max_parameter, max_operator = cls.get_min_max_operator(filter_feature)
|
|
32
40
|
|
|
33
41
|
if min_parameter is not None:
|
|
@@ -40,21 +48,36 @@ class PandasFilterEngine(BaseFilterEngine):
|
|
|
40
48
|
f"Filter parameter {filter_feature.parameter} is None although expected: {filter_feature.name}"
|
|
41
49
|
)
|
|
42
50
|
|
|
43
|
-
|
|
44
|
-
data[data[filter_feature.name] < max_parameter]
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
51
|
+
if max_operator is True:
|
|
52
|
+
return data[data[filter_feature.name] < max_parameter]
|
|
53
|
+
else:
|
|
54
|
+
return data[data[filter_feature.name] <= max_parameter]
|
|
55
|
+
elif has_value:
|
|
56
|
+
# Simple parameter - extract the value
|
|
57
|
+
value = filter_feature.parameter.value
|
|
58
|
+
if value is None:
|
|
59
|
+
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
60
|
+
return data[data[filter_feature.name] <= value]
|
|
61
|
+
else:
|
|
62
|
+
raise ValueError(f"No valid filter parameter found in {filter_feature.parameter}")
|
|
49
63
|
|
|
50
64
|
@classmethod
|
|
51
65
|
def do_equal_filter(cls, data: Any, filter_feature: SingleFilter) -> Any:
|
|
52
|
-
|
|
66
|
+
value = filter_feature.parameter.value
|
|
67
|
+
if value is None:
|
|
68
|
+
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
69
|
+
return data[data[filter_feature.name] == value]
|
|
53
70
|
|
|
54
71
|
@classmethod
|
|
55
72
|
def do_regex_filter(cls, data: Any, filter_feature: SingleFilter) -> Any:
|
|
56
|
-
|
|
73
|
+
value = filter_feature.parameter.value
|
|
74
|
+
if value is None:
|
|
75
|
+
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
76
|
+
return data[data[filter_feature.name].astype(str).str.match(value)]
|
|
57
77
|
|
|
58
78
|
@classmethod
|
|
59
79
|
def do_categorical_inclusion_filter(cls, data: Any, filter_feature: SingleFilter) -> Any:
|
|
60
|
-
|
|
80
|
+
values = filter_feature.parameter.values
|
|
81
|
+
if values is None:
|
|
82
|
+
raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
|
|
83
|
+
return data[data[filter_feature.name].isin(values)]
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import Any, Union
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from mloda.user import Index
|
|
4
|
+
from mloda.user import JoinType
|
|
5
|
+
from mloda.provider import BaseMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
import pandas as pd
|
|
@@ -49,14 +49,14 @@ class PandasMergeEngine(BaseMergeEngine):
|
|
|
49
49
|
left_data = self.pd_merge()(left_data, right_data, left_on=left_idx, right_on=right_idx, how=join_type)
|
|
50
50
|
return left_data
|
|
51
51
|
|
|
52
|
-
@
|
|
53
|
-
def pd_merge() -> Any:
|
|
52
|
+
@classmethod
|
|
53
|
+
def pd_merge(cls) -> Any:
|
|
54
54
|
if pd is None:
|
|
55
55
|
raise ImportError("Pandas is not installed. To be able to use this framework, please install pandas.")
|
|
56
56
|
return pd.merge
|
|
57
57
|
|
|
58
|
-
@
|
|
59
|
-
def pd_concat() -> Any:
|
|
58
|
+
@classmethod
|
|
59
|
+
def pd_concat(cls) -> Any:
|
|
60
60
|
if pd is None:
|
|
61
61
|
raise ImportError("Pandas is not installed. To be able to use this framework, please install pandas.")
|
|
62
62
|
return pd.concat
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from typing import Any, Set, Type
|
|
2
|
-
from
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
3
|
from mloda_plugins.compute_framework.base_implementations.polars.polars_merge_engine import PolarsMergeEngine
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
from
|
|
4
|
+
from mloda.user import FeatureName
|
|
5
|
+
from mloda import ComputeFramework
|
|
6
|
+
from mloda.provider import BaseFilterEngine
|
|
7
7
|
from mloda_plugins.compute_framework.base_implementations.polars.polars_filter_engine import PolarsFilterEngine
|
|
8
8
|
|
|
9
9
|
try:
|
|
@@ -12,7 +12,7 @@ except ImportError:
|
|
|
12
12
|
pl = None # type: ignore[assignment]
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
class PolarsDataFrame(
|
|
15
|
+
class PolarsDataFrame(ComputeFramework):
|
|
16
16
|
@staticmethod
|
|
17
17
|
def is_available() -> bool:
|
|
18
18
|
"""Check if Polars is installed and available."""
|
|
@@ -23,11 +23,12 @@ class PolarsDataFrame(ComputeFrameWork):
|
|
|
23
23
|
except ImportError:
|
|
24
24
|
return False
|
|
25
25
|
|
|
26
|
-
@
|
|
27
|
-
def expected_data_framework() -> Any:
|
|
28
|
-
return
|
|
26
|
+
@classmethod
|
|
27
|
+
def expected_data_framework(cls) -> Any:
|
|
28
|
+
return cls.pl_dataframe()
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
@classmethod
|
|
31
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
31
32
|
return PolarsMergeEngine
|
|
32
33
|
|
|
33
34
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
|
@@ -38,14 +39,14 @@ class PolarsDataFrame(ComputeFrameWork):
|
|
|
38
39
|
def set_column_names(self) -> None:
|
|
39
40
|
self.column_names = set(self.data.columns)
|
|
40
41
|
|
|
41
|
-
@
|
|
42
|
-
def pl_dataframe() -> Any:
|
|
42
|
+
@classmethod
|
|
43
|
+
def pl_dataframe(cls) -> Any:
|
|
43
44
|
if pl is None:
|
|
44
45
|
raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
|
|
45
46
|
return pl.DataFrame
|
|
46
47
|
|
|
47
|
-
@
|
|
48
|
-
def pl_series() -> Any:
|
|
48
|
+
@classmethod
|
|
49
|
+
def pl_series(cls) -> Any:
|
|
49
50
|
if pl is None:
|
|
50
51
|
raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
|
|
51
52
|
return pl.Series
|
|
@@ -77,5 +78,6 @@ class PolarsDataFrame(ComputeFrameWork):
|
|
|
77
78
|
|
|
78
79
|
raise ValueError(f"Data {type(data)} is not supported by {self.__class__.__name__}")
|
|
79
80
|
|
|
80
|
-
|
|
81
|
+
@classmethod
|
|
82
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
81
83
|
return PolarsFilterEngine
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Any, Set, Type
|
|
2
|
-
from
|
|
2
|
+
from mloda.user import FeatureName
|
|
3
3
|
from mloda_plugins.compute_framework.base_implementations.polars.dataframe import PolarsDataFrame
|
|
4
|
-
from
|
|
4
|
+
from mloda.provider import BaseMergeEngine
|
|
5
5
|
from mloda_plugins.compute_framework.base_implementations.polars.polars_lazy_merge_engine import PolarsLazyMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
@@ -18,11 +18,12 @@ class PolarsLazyDataFrame(PolarsDataFrame):
|
|
|
18
18
|
requested, enabling query optimization and reduced memory usage for large datasets.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
-
@
|
|
22
|
-
def expected_data_framework() -> Any:
|
|
23
|
-
return
|
|
21
|
+
@classmethod
|
|
22
|
+
def expected_data_framework(cls) -> Any:
|
|
23
|
+
return cls.pl_lazy_frame()
|
|
24
24
|
|
|
25
|
-
|
|
25
|
+
@classmethod
|
|
26
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
26
27
|
return PolarsLazyMergeEngine
|
|
27
28
|
|
|
28
29
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
|
@@ -39,20 +40,20 @@ class PolarsLazyDataFrame(PolarsDataFrame):
|
|
|
39
40
|
else:
|
|
40
41
|
raise ValueError("Data does not have a collect_schema method, cannot set column names.")
|
|
41
42
|
|
|
42
|
-
@
|
|
43
|
-
def pl_lazy_frame() -> Any:
|
|
43
|
+
@classmethod
|
|
44
|
+
def pl_lazy_frame(cls) -> Any:
|
|
44
45
|
if pl is None:
|
|
45
46
|
raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
|
|
46
47
|
return pl.LazyFrame
|
|
47
48
|
|
|
48
|
-
@
|
|
49
|
-
def pl_dataframe() -> Any:
|
|
49
|
+
@classmethod
|
|
50
|
+
def pl_dataframe(cls) -> Any:
|
|
50
51
|
if pl is None:
|
|
51
52
|
raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
|
|
52
53
|
return pl.DataFrame
|
|
53
54
|
|
|
54
|
-
@
|
|
55
|
-
def pl_series() -> Any:
|
|
55
|
+
@classmethod
|
|
56
|
+
def pl_series(cls) -> Any:
|
|
56
57
|
if pl is None:
|
|
57
58
|
raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
|
|
58
59
|
return pl.Series
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import Any, Union
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from mloda.user import Index
|
|
4
|
+
from mloda.user import JoinType
|
|
5
|
+
from mloda.provider import BaseMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
import polars as pl
|
|
@@ -2,8 +2,8 @@ from typing import Any
|
|
|
2
2
|
import pyarrow as pa
|
|
3
3
|
import pyarrow.compute as pc
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
from
|
|
5
|
+
from mloda.provider import BaseFilterEngine
|
|
6
|
+
from mloda.user import SingleFilter
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class PyArrowFilterEngine(BaseFilterEngine):
|
|
@@ -3,9 +3,9 @@ from typing import Any
|
|
|
3
3
|
import pyarrow as pa
|
|
4
4
|
import pyarrow.compute as pc
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
from
|
|
6
|
+
from mloda.user import Index
|
|
7
|
+
from mloda.user import JoinType
|
|
8
|
+
from mloda.provider import BaseMergeEngine
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class PyArrowMergeEngine(BaseMergeEngine):
|
|
@@ -1,13 +1,12 @@
|
|
|
1
|
-
from typing import Any,
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from mloda_core.filter.filter_engine import BaseFilterEngine
|
|
1
|
+
from typing import Any, Set, Type
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
|
+
from mloda.provider import BaseFilterEngine
|
|
5
4
|
from mloda_plugins.compute_framework.base_implementations.pyarrow.pyarrow_merge_engine import PyArrowMergeEngine
|
|
6
5
|
from mloda_plugins.compute_framework.base_implementations.pyarrow.pyarrow_filter_engine import PyArrowFilterEngine
|
|
7
6
|
import pyarrow as pa
|
|
8
7
|
|
|
9
|
-
from
|
|
10
|
-
from
|
|
8
|
+
from mloda.user import FeatureName
|
|
9
|
+
from mloda import ComputeFramework
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
try:
|
|
@@ -16,7 +15,7 @@ except ImportError:
|
|
|
16
15
|
pd = None
|
|
17
16
|
|
|
18
17
|
|
|
19
|
-
class PyArrowTable(
|
|
18
|
+
class PyArrowTable(ComputeFramework):
|
|
20
19
|
@staticmethod
|
|
21
20
|
def is_available() -> bool:
|
|
22
21
|
"""Check if PyArrow is installed and available."""
|
|
@@ -27,14 +26,16 @@ class PyArrowTable(ComputeFrameWork):
|
|
|
27
26
|
except ImportError:
|
|
28
27
|
return False
|
|
29
28
|
|
|
30
|
-
@
|
|
31
|
-
def expected_data_framework() -> Any:
|
|
29
|
+
@classmethod
|
|
30
|
+
def expected_data_framework(cls) -> Any:
|
|
32
31
|
return pa.Table
|
|
33
32
|
|
|
34
|
-
|
|
33
|
+
@classmethod
|
|
34
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
35
35
|
return PyArrowMergeEngine
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
@classmethod
|
|
38
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
38
39
|
return PyArrowFilterEngine
|
|
39
40
|
|
|
40
41
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from typing import Any
|
|
3
|
-
from
|
|
4
|
-
from
|
|
3
|
+
from mloda.provider import BaseFilterEngine
|
|
4
|
+
from mloda.user import SingleFilter
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class PythonDictFilterEngine(BaseFilterEngine):
|
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from typing import Any, Set, Type, List, Dict
|
|
2
|
-
from
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
3
|
from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_merge_engine import (
|
|
4
4
|
PythonDictMergeEngine,
|
|
5
5
|
)
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
from
|
|
6
|
+
from mloda.user import FeatureName
|
|
7
|
+
from mloda import ComputeFramework
|
|
8
|
+
from mloda.provider import BaseFilterEngine
|
|
9
9
|
from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_filter_engine import (
|
|
10
10
|
PythonDictFilterEngine,
|
|
11
11
|
)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
class PythonDictFramework(
|
|
14
|
+
class PythonDictFramework(ComputeFramework):
|
|
15
15
|
"""
|
|
16
16
|
PythonDict Compute Framework
|
|
17
17
|
|
|
@@ -29,11 +29,12 @@ class PythonDictFramework(ComputeFrameWork):
|
|
|
29
29
|
]
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
|
-
@
|
|
33
|
-
def expected_data_framework() -> Any:
|
|
32
|
+
@classmethod
|
|
33
|
+
def expected_data_framework(cls) -> Any:
|
|
34
34
|
return list
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
@classmethod
|
|
37
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
37
38
|
return PythonDictMergeEngine
|
|
38
39
|
|
|
39
40
|
def select_data_by_column_names(
|
|
@@ -118,7 +119,8 @@ class PythonDictFramework(ComputeFrameWork):
|
|
|
118
119
|
|
|
119
120
|
raise ValueError(f"Data type {type(data)} is not supported by {self.__class__.__name__}")
|
|
120
121
|
|
|
121
|
-
|
|
122
|
+
@classmethod
|
|
123
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
122
124
|
"""
|
|
123
125
|
Returns the filter engine for PythonDict framework.
|
|
124
126
|
|