mloda 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -46
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +31 -40
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
- mloda-0.4.0.dist-info/RECORD +248 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.2.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -7,16 +7,19 @@ from __future__ import annotations
|
|
|
7
7
|
from abc import abstractmethod
|
|
8
8
|
from typing import Any, List, Optional, Set, Type, Union
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
from mloda import FeatureGroup
|
|
11
|
+
from mloda import Feature
|
|
12
|
+
from mloda.provider import FeatureChainParser
|
|
13
|
+
from mloda.provider import (
|
|
14
|
+
FeatureChainParserMixin,
|
|
15
|
+
)
|
|
16
|
+
from mloda.user import FeatureName
|
|
17
|
+
from mloda.provider import FeatureSet
|
|
18
|
+
from mloda import Options
|
|
16
19
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
17
20
|
|
|
18
21
|
|
|
19
|
-
class TimeWindowFeatureGroup(
|
|
22
|
+
class TimeWindowFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
20
23
|
# Option keys for time window configuration
|
|
21
24
|
WINDOW_FUNCTION = "window_function"
|
|
22
25
|
WINDOW_SIZE = "window_size"
|
|
@@ -66,21 +69,21 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
66
69
|
|
|
67
70
|
## Requirements
|
|
68
71
|
- The input data must have a datetime column that can be used for time-based operations
|
|
69
|
-
- By default, the feature group will use DefaultOptionKeys.reference_time (default: "
|
|
72
|
+
- By default, the feature group will use DefaultOptionKeys.reference_time (default: "reference_time")
|
|
70
73
|
- You can specify a custom time column by setting the reference_time option in the feature group options
|
|
71
74
|
|
|
72
75
|
"""
|
|
73
76
|
|
|
74
77
|
@classmethod
|
|
75
|
-
def
|
|
78
|
+
def get_reference_time_column(cls, options: Optional[Options] = None) -> str:
|
|
76
79
|
"""
|
|
77
|
-
Get the time
|
|
80
|
+
Get the reference time column name from options or use the default.
|
|
78
81
|
|
|
79
82
|
Args:
|
|
80
|
-
options: Optional Options object that may contain a custom time
|
|
83
|
+
options: Optional Options object that may contain a custom reference time column name
|
|
81
84
|
|
|
82
85
|
Returns:
|
|
83
|
-
The time
|
|
86
|
+
The reference time column name to use
|
|
84
87
|
"""
|
|
85
88
|
reference_time_key = DefaultOptionKeys.reference_time.value
|
|
86
89
|
if options and options.get(reference_time_key):
|
|
@@ -90,7 +93,7 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
90
93
|
f"Invalid reference_time option: {reference_time}. Must be string. Is: {type(reference_time)}."
|
|
91
94
|
)
|
|
92
95
|
return reference_time
|
|
93
|
-
return
|
|
96
|
+
return DefaultOptionKeys.reference_time.value
|
|
94
97
|
|
|
95
98
|
# Define supported window functions
|
|
96
99
|
WINDOW_FUNCTIONS = {
|
|
@@ -123,28 +126,28 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
123
126
|
# Window function parameter (context parameter)
|
|
124
127
|
WINDOW_FUNCTION: {
|
|
125
128
|
**WINDOW_FUNCTIONS, # Reference existing WINDOW_FUNCTIONS dict
|
|
126
|
-
DefaultOptionKeys.
|
|
127
|
-
DefaultOptionKeys.
|
|
129
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
130
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
128
131
|
},
|
|
129
132
|
# Window size parameter (context parameter)
|
|
130
133
|
WINDOW_SIZE: {
|
|
131
134
|
"explanation": "Size of the time window (must be positive integer)",
|
|
132
|
-
DefaultOptionKeys.
|
|
133
|
-
DefaultOptionKeys.
|
|
134
|
-
DefaultOptionKeys.
|
|
135
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
136
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
137
|
+
DefaultOptionKeys.validation_function: lambda x: (isinstance(x, int) and x > 0)
|
|
135
138
|
or (isinstance(x, str) and x.isdigit() and int(x) > 0),
|
|
136
139
|
},
|
|
137
140
|
# Time unit parameter (context parameter)
|
|
138
141
|
TIME_UNIT: {
|
|
139
142
|
**TIME_UNITS, # Reference existing TIME_UNITS dict
|
|
140
|
-
DefaultOptionKeys.
|
|
141
|
-
DefaultOptionKeys.
|
|
143
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
144
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
142
145
|
},
|
|
143
146
|
# Source feature parameter (context parameter)
|
|
144
147
|
DefaultOptionKeys.in_features: {
|
|
145
148
|
"explanation": "Source feature to apply time window operation to",
|
|
146
|
-
DefaultOptionKeys.
|
|
147
|
-
DefaultOptionKeys.
|
|
149
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
150
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
148
151
|
},
|
|
149
152
|
}
|
|
150
153
|
|
|
@@ -152,6 +155,11 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
152
155
|
PATTERN = "__"
|
|
153
156
|
PREFIX_PATTERN = r".*__([\w]+)_(\d+)_([\w]+)_window$"
|
|
154
157
|
|
|
158
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
159
|
+
MIN_IN_FEATURES = 1
|
|
160
|
+
MAX_IN_FEATURES = 1
|
|
161
|
+
|
|
162
|
+
# Custom input_features needed to add time_filter_feature
|
|
155
163
|
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
156
164
|
"""Extract source feature from either configuration-based options or string parsing."""
|
|
157
165
|
|
|
@@ -160,7 +168,7 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
160
168
|
# Try string-based parsing first
|
|
161
169
|
_, source_feature = FeatureChainParser.parse_feature_name(feature_name.name, [self.PREFIX_PATTERN])
|
|
162
170
|
if source_feature is not None:
|
|
163
|
-
time_filter_feature = Feature(self.
|
|
171
|
+
time_filter_feature = Feature(self.get_reference_time_column(options))
|
|
164
172
|
return {Feature(source_feature), time_filter_feature}
|
|
165
173
|
|
|
166
174
|
# Fall back to configuration-based approach
|
|
@@ -170,9 +178,66 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
170
178
|
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
171
179
|
)
|
|
172
180
|
|
|
173
|
-
time_filter_feature = Feature(self.
|
|
181
|
+
time_filter_feature = Feature(self.get_reference_time_column(options))
|
|
174
182
|
return set(source_features) | {time_filter_feature}
|
|
175
183
|
|
|
184
|
+
@classmethod
|
|
185
|
+
def _extract_time_window_params(cls, feature: Feature) -> tuple[Optional[str], Optional[int], Optional[str]]:
|
|
186
|
+
"""
|
|
187
|
+
Extract time window parameters (window_function, window_size, time_unit) from a feature.
|
|
188
|
+
|
|
189
|
+
Tries string-based parsing first using parse_time_window_prefix, falls back to configuration.
|
|
190
|
+
|
|
191
|
+
Args:
|
|
192
|
+
feature: The feature to extract parameters from
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
Tuple of (window_function, window_size, time_unit), where any value may be None if not found
|
|
196
|
+
"""
|
|
197
|
+
feature_name = feature.get_name()
|
|
198
|
+
|
|
199
|
+
# Try string-based parsing first
|
|
200
|
+
try:
|
|
201
|
+
window_function, window_size, time_unit = cls.parse_time_window_prefix(feature_name)
|
|
202
|
+
return window_function, window_size, time_unit
|
|
203
|
+
except ValueError:
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
# Fall back to configuration
|
|
207
|
+
window_function = feature.options.get(cls.WINDOW_FUNCTION)
|
|
208
|
+
window_size = feature.options.get(cls.WINDOW_SIZE)
|
|
209
|
+
time_unit = feature.options.get(cls.TIME_UNIT)
|
|
210
|
+
|
|
211
|
+
# Convert window_size to int if it's a string
|
|
212
|
+
if window_size is not None and isinstance(window_size, str):
|
|
213
|
+
window_size = int(window_size)
|
|
214
|
+
|
|
215
|
+
return window_function, window_size, time_unit
|
|
216
|
+
|
|
217
|
+
@classmethod
|
|
218
|
+
def _extract_time_window_params_and_source_features(cls, feature: Feature) -> tuple[str, int, str, str]:
|
|
219
|
+
"""
|
|
220
|
+
Extract time window parameters and source feature from a feature.
|
|
221
|
+
|
|
222
|
+
Tries string-based parsing first, falls back to configuration-based approach.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
feature: The feature to extract parameters from
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
Tuple of (window_function, window_size, time_unit, source_feature_name)
|
|
229
|
+
|
|
230
|
+
Raises:
|
|
231
|
+
ValueError: If parameters cannot be extracted
|
|
232
|
+
"""
|
|
233
|
+
source_features = cls._extract_source_features(feature)
|
|
234
|
+
window_function, window_size, time_unit = cls._extract_time_window_params(feature)
|
|
235
|
+
|
|
236
|
+
if window_function is None or window_size is None or time_unit is None:
|
|
237
|
+
raise ValueError(f"Could not extract time window parameters from: {feature.name}")
|
|
238
|
+
|
|
239
|
+
return window_function, window_size, time_unit, source_features[0]
|
|
240
|
+
|
|
176
241
|
@classmethod
|
|
177
242
|
def parse_time_window_prefix(cls, feature_name: str) -> tuple[str, int, str]:
|
|
178
243
|
"""
|
|
@@ -243,24 +308,7 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
243
308
|
"""Extract the time unit from the feature name."""
|
|
244
309
|
return cls.parse_time_window_prefix(feature_name)[2]
|
|
245
310
|
|
|
246
|
-
|
|
247
|
-
def match_feature_group_criteria(
|
|
248
|
-
cls,
|
|
249
|
-
feature_name: Union[FeatureName, str],
|
|
250
|
-
options: Options,
|
|
251
|
-
data_access_collection: Optional[Any] = None,
|
|
252
|
-
) -> bool:
|
|
253
|
-
"""Check if feature name matches the expected pattern for time window features."""
|
|
254
|
-
if isinstance(feature_name, FeatureName):
|
|
255
|
-
feature_name = feature_name.name
|
|
256
|
-
|
|
257
|
-
# Use unified parser approach with PROPERTY_MAPPING
|
|
258
|
-
return FeatureChainParser.match_configuration_feature_chain_parser(
|
|
259
|
-
feature_name,
|
|
260
|
-
options,
|
|
261
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
262
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
263
|
-
)
|
|
311
|
+
# match_feature_group_criteria() inherited from FeatureChainParserMixin
|
|
264
312
|
|
|
265
313
|
@classmethod
|
|
266
314
|
def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
|
|
@@ -283,52 +331,17 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
283
331
|
raise ValueError("All features must have the same options.")
|
|
284
332
|
_options = feature.options
|
|
285
333
|
|
|
286
|
-
|
|
334
|
+
reference_time_column = cls.get_reference_time_column(_options)
|
|
287
335
|
|
|
288
|
-
cls.
|
|
336
|
+
cls._check_reference_time_column_exists(data, reference_time_column)
|
|
289
337
|
|
|
290
|
-
cls.
|
|
338
|
+
cls._check_reference_time_column_is_datetime(data, reference_time_column)
|
|
291
339
|
|
|
292
340
|
# Process each requested feature
|
|
293
341
|
for feature in features.features:
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
parsed_params, in_features = FeatureChainParser.parse_feature_name(feature_name, [cls.PREFIX_PATTERN])
|
|
298
|
-
|
|
299
|
-
if in_features is not None:
|
|
300
|
-
# String-based approach succeeded
|
|
301
|
-
window_function, window_size, time_unit = cls.parse_time_window_prefix(feature_name)
|
|
302
|
-
else:
|
|
303
|
-
# Fall back to configuration-based approach
|
|
304
|
-
has_config_params = (
|
|
305
|
-
feature.options.get(cls.WINDOW_FUNCTION) is not None
|
|
306
|
-
and feature.options.get(cls.WINDOW_SIZE) is not None
|
|
307
|
-
and feature.options.get(cls.TIME_UNIT) is not None
|
|
308
|
-
)
|
|
309
|
-
|
|
310
|
-
if not has_config_params:
|
|
311
|
-
raise ValueError(
|
|
312
|
-
f"Feature '{feature_name}' does not match string pattern and lacks configuration parameters"
|
|
313
|
-
)
|
|
314
|
-
|
|
315
|
-
# Configuration-based approach
|
|
316
|
-
source_features = feature.options.get_in_features()
|
|
317
|
-
if len(source_features) != 1:
|
|
318
|
-
raise ValueError(
|
|
319
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
320
|
-
)
|
|
321
|
-
source_feature = next(iter(source_features))
|
|
322
|
-
in_features = source_feature.get_name()
|
|
323
|
-
|
|
324
|
-
# Extract parameters from options
|
|
325
|
-
window_function = feature.options.get(cls.WINDOW_FUNCTION)
|
|
326
|
-
window_size = feature.options.get(cls.WINDOW_SIZE)
|
|
327
|
-
time_unit = feature.options.get(cls.TIME_UNIT)
|
|
328
|
-
|
|
329
|
-
# Convert window_size to int if it's a string
|
|
330
|
-
if isinstance(window_size, str):
|
|
331
|
-
window_size = int(window_size)
|
|
342
|
+
window_function, window_size, time_unit, in_features = cls._extract_time_window_params_and_source_features(
|
|
343
|
+
feature
|
|
344
|
+
)
|
|
332
345
|
|
|
333
346
|
# Resolve multi-column features automatically
|
|
334
347
|
# If in_features is "onehot_encoded__product", this discovers
|
|
@@ -340,7 +353,7 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
340
353
|
cls._check_source_features_exist(data, resolved_columns)
|
|
341
354
|
|
|
342
355
|
result = cls._perform_window_operation(
|
|
343
|
-
data, window_function, window_size, time_unit, resolved_columns,
|
|
356
|
+
data, window_function, window_size, time_unit, resolved_columns, reference_time_column
|
|
344
357
|
)
|
|
345
358
|
|
|
346
359
|
data = cls._add_result_to_data(data, feature.get_name(), result)
|
|
@@ -349,31 +362,31 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
349
362
|
|
|
350
363
|
@classmethod
|
|
351
364
|
@abstractmethod
|
|
352
|
-
def
|
|
365
|
+
def _check_reference_time_column_exists(cls, data: Any, reference_time_column: str) -> None:
|
|
353
366
|
"""
|
|
354
|
-
Check if the time
|
|
367
|
+
Check if the reference time column exists in the data.
|
|
355
368
|
|
|
356
369
|
Args:
|
|
357
370
|
data: The input data
|
|
358
|
-
|
|
371
|
+
reference_time_column: The name of the reference time column
|
|
359
372
|
|
|
360
373
|
Raises:
|
|
361
|
-
ValueError: If the time
|
|
374
|
+
ValueError: If the reference time column does not exist in the data
|
|
362
375
|
"""
|
|
363
376
|
...
|
|
364
377
|
|
|
365
378
|
@classmethod
|
|
366
379
|
@abstractmethod
|
|
367
|
-
def
|
|
380
|
+
def _check_reference_time_column_is_datetime(cls, data: Any, reference_time_column: str) -> None:
|
|
368
381
|
"""
|
|
369
|
-
Check if the time
|
|
382
|
+
Check if the reference time column is a datetime column.
|
|
370
383
|
|
|
371
384
|
Args:
|
|
372
385
|
data: The input data
|
|
373
|
-
|
|
386
|
+
reference_time_column: The name of the reference time column
|
|
374
387
|
|
|
375
388
|
Raises:
|
|
376
|
-
ValueError: If the time
|
|
389
|
+
ValueError: If the reference time column is not a datetime column
|
|
377
390
|
"""
|
|
378
391
|
...
|
|
379
392
|
|
|
@@ -447,7 +460,7 @@ class TimeWindowFeatureGroup(AbstractFeatureGroup):
|
|
|
447
460
|
time_unit: The time unit for the window
|
|
448
461
|
in_features: List of resolved source feature names to perform window operation on
|
|
449
462
|
time_filter_feature: The name of the time filter feature to use for time-based operations.
|
|
450
|
-
If None, uses the value from
|
|
463
|
+
If None, uses the value from get_reference_time_column().
|
|
451
464
|
|
|
452
465
|
Returns:
|
|
453
466
|
The result of the window operation
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, List, Optional, Set, Type, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from mloda import ComputeFramework
|
|
10
10
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
11
11
|
from mloda_plugins.feature_group.experimental.time_window.base import TimeWindowFeatureGroup
|
|
12
12
|
|
|
@@ -19,25 +19,25 @@ except ImportError:
|
|
|
19
19
|
|
|
20
20
|
class PandasTimeWindowFeatureGroup(TimeWindowFeatureGroup):
|
|
21
21
|
@classmethod
|
|
22
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
22
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
23
23
|
return {PandasDataFrame}
|
|
24
24
|
|
|
25
25
|
@classmethod
|
|
26
|
-
def
|
|
27
|
-
"""Check if the time
|
|
28
|
-
if
|
|
26
|
+
def _check_reference_time_column_exists(cls, data: pd.DataFrame, reference_time_column: str) -> None:
|
|
27
|
+
"""Check if the reference time column exists in the DataFrame."""
|
|
28
|
+
if reference_time_column not in data.columns:
|
|
29
29
|
raise ValueError(
|
|
30
|
-
f"
|
|
30
|
+
f"Reference time column '{reference_time_column}' not found in data. "
|
|
31
31
|
f"Please ensure the DataFrame contains this column."
|
|
32
32
|
)
|
|
33
33
|
|
|
34
34
|
@classmethod
|
|
35
|
-
def
|
|
36
|
-
"""Check if the time
|
|
37
|
-
if not pd.api.types.is_datetime64_any_dtype(data[
|
|
35
|
+
def _check_reference_time_column_is_datetime(cls, data: pd.DataFrame, reference_time_column: str) -> None:
|
|
36
|
+
"""Check if the reference time column is a datetime column."""
|
|
37
|
+
if not pd.api.types.is_datetime64_any_dtype(data[reference_time_column]):
|
|
38
38
|
raise ValueError(
|
|
39
|
-
f"
|
|
40
|
-
f"Current dtype: {data[
|
|
39
|
+
f"Reference time column '{reference_time_column}' must be a datetime column. "
|
|
40
|
+
f"Current dtype: {data[reference_time_column].dtype}"
|
|
41
41
|
)
|
|
42
42
|
|
|
43
43
|
@classmethod
|
|
@@ -93,14 +93,14 @@ class PandasTimeWindowFeatureGroup(TimeWindowFeatureGroup):
|
|
|
93
93
|
time_unit: The time unit for the window
|
|
94
94
|
in_features: List of source feature names (may be single or multiple columns)
|
|
95
95
|
time_filter_feature: The name of the time filter feature to use for time-based operations.
|
|
96
|
-
If None, uses the value from
|
|
96
|
+
If None, uses the value from get_reference_time_column().
|
|
97
97
|
|
|
98
98
|
Returns:
|
|
99
99
|
The result of the window operation
|
|
100
100
|
"""
|
|
101
101
|
# Use the default time filter feature if none is provided
|
|
102
102
|
if time_filter_feature is None:
|
|
103
|
-
time_filter_feature = cls.
|
|
103
|
+
time_filter_feature = cls.get_reference_time_column()
|
|
104
104
|
|
|
105
105
|
# Create a copy of the DataFrame with the time filter feature as the index
|
|
106
106
|
# This is necessary for time-based rolling operations
|
|
@@ -10,7 +10,7 @@ import datetime
|
|
|
10
10
|
import pyarrow as pa
|
|
11
11
|
import pyarrow.compute as pc
|
|
12
12
|
|
|
13
|
-
from
|
|
13
|
+
from mloda import ComputeFramework
|
|
14
14
|
|
|
15
15
|
from mloda_plugins.compute_framework.base_implementations.pyarrow.table import PyArrowTable
|
|
16
16
|
from mloda_plugins.feature_group.experimental.time_window.base import TimeWindowFeatureGroup
|
|
@@ -18,25 +18,25 @@ from mloda_plugins.feature_group.experimental.time_window.base import TimeWindow
|
|
|
18
18
|
|
|
19
19
|
class PyArrowTimeWindowFeatureGroup(TimeWindowFeatureGroup):
|
|
20
20
|
@classmethod
|
|
21
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
21
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
22
22
|
return {PyArrowTable}
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
|
-
def
|
|
26
|
-
"""Check if the time
|
|
27
|
-
if
|
|
25
|
+
def _check_reference_time_column_exists(cls, data: pa.Table, reference_time_column: str) -> None:
|
|
26
|
+
"""Check if the reference time column exists in the Table."""
|
|
27
|
+
if reference_time_column not in data.schema.names:
|
|
28
28
|
raise ValueError(
|
|
29
|
-
f"
|
|
29
|
+
f"Reference time column '{reference_time_column}' not found in data. "
|
|
30
30
|
f"Please ensure the Table contains this column."
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
@classmethod
|
|
34
|
-
def
|
|
35
|
-
"""Check if the time
|
|
36
|
-
time_column = data.column(
|
|
34
|
+
def _check_reference_time_column_is_datetime(cls, data: pa.Table, reference_time_column: str) -> None:
|
|
35
|
+
"""Check if the reference time column is a datetime column."""
|
|
36
|
+
time_column = data.column(reference_time_column)
|
|
37
37
|
if not pa.types.is_timestamp(time_column.type):
|
|
38
38
|
raise ValueError(
|
|
39
|
-
f"
|
|
39
|
+
f"Reference time column '{reference_time_column}' must be a timestamp column. "
|
|
40
40
|
f"Current type: {time_column.type}"
|
|
41
41
|
)
|
|
42
42
|
|
|
@@ -103,14 +103,14 @@ class PyArrowTimeWindowFeatureGroup(TimeWindowFeatureGroup):
|
|
|
103
103
|
time_unit: The time unit for the window
|
|
104
104
|
in_features: List of source feature names (may be single or multiple columns)
|
|
105
105
|
time_filter_feature: The name of the time filter feature to use for time-based operations.
|
|
106
|
-
If None, uses the value from
|
|
106
|
+
If None, uses the value from get_reference_time_column().
|
|
107
107
|
|
|
108
108
|
Returns:
|
|
109
109
|
The result of the window operation as a PyArrow Array
|
|
110
110
|
"""
|
|
111
111
|
# Use the default time filter feature if none is provided
|
|
112
112
|
if time_filter_feature is None:
|
|
113
|
-
time_filter_feature = cls.
|
|
113
|
+
time_filter_feature = cls.get_reference_time_column()
|
|
114
114
|
|
|
115
115
|
# Get the time column
|
|
116
116
|
time_column = data.column(time_filter_feature)
|
|
@@ -1,11 +1,9 @@
|
|
|
1
1
|
from typing import Any, Optional
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from mloda_core.abstract_plugins.components.input_data.api.api_input_data import ApiInputData
|
|
5
|
-
from mloda_core.abstract_plugins.components.input_data.base_input_data import BaseInputData
|
|
2
|
+
from mloda import FeatureGroup
|
|
3
|
+
from mloda.provider import FeatureSet, ApiData as ApiInputData, BaseInputData
|
|
6
4
|
|
|
7
5
|
|
|
8
|
-
class ApiInputDataFeature(
|
|
6
|
+
class ApiInputDataFeature(FeatureGroup):
|
|
9
7
|
"""
|
|
10
8
|
Base class for API-based input data feature groups.
|
|
11
9
|
|
|
@@ -39,9 +37,9 @@ class ApiInputDataFeature(AbstractFeatureGroup):
|
|
|
39
37
|
Uses Options to specify API data access configuration:
|
|
40
38
|
|
|
41
39
|
```python
|
|
42
|
-
from
|
|
43
|
-
from
|
|
44
|
-
from
|
|
40
|
+
from mloda import Feature
|
|
41
|
+
from mloda import Options
|
|
42
|
+
from mloda.core.abstract_plugins.components.input_data.api.api_input_data import ApiInputData
|
|
45
43
|
|
|
46
44
|
feature = Feature(
|
|
47
45
|
name="user_profile",
|
|
@@ -60,8 +58,8 @@ class ApiInputDataFeature(AbstractFeatureGroup):
|
|
|
60
58
|
### Basic API Data Access
|
|
61
59
|
|
|
62
60
|
```python
|
|
63
|
-
from
|
|
64
|
-
from
|
|
61
|
+
from mloda import Feature
|
|
62
|
+
from mloda import Options
|
|
65
63
|
|
|
66
64
|
# Simple API feature reference
|
|
67
65
|
feature = Feature(name="api_user_score")
|
|
@@ -70,7 +68,7 @@ class ApiInputDataFeature(AbstractFeatureGroup):
|
|
|
70
68
|
### Configuration-Based with Endpoint Mapping
|
|
71
69
|
|
|
72
70
|
```python
|
|
73
|
-
from
|
|
71
|
+
from mloda.core.abstract_plugins.components.input_data.api.api_input_data import ApiInputData
|
|
74
72
|
|
|
75
73
|
# Map multiple API response fields
|
|
76
74
|
feature = Feature(
|
|
@@ -3,12 +3,12 @@ from pathlib import Path
|
|
|
3
3
|
from typing import Any, List, Set
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
from
|
|
7
|
-
from
|
|
8
|
-
from
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
6
|
+
from mloda import FeatureGroup
|
|
7
|
+
from mloda import Feature
|
|
8
|
+
from mloda.user import FeatureName
|
|
9
|
+
from mloda.provider import FeatureSet
|
|
10
|
+
from mloda.user import JoinType
|
|
11
|
+
from mloda import Options
|
|
12
12
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
13
13
|
from mloda_plugins.feature_group.experimental.dynamic_feature_group_factory.dynamic_feature_group_factory import (
|
|
14
14
|
DynamicFeatureGroupCreator,
|
|
@@ -27,7 +27,7 @@ except ImportError:
|
|
|
27
27
|
pd = None
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
class ConcatenatedFileContent(
|
|
30
|
+
class ConcatenatedFileContent(FeatureGroup):
|
|
31
31
|
"""
|
|
32
32
|
A feature group that reads and combines content from files within a directory (default: python files).
|
|
33
33
|
|
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from mloda_core.abstract_plugins.components.input_data.base_input_data import BaseInputData
|
|
6
|
-
from mloda_core.abstract_plugins.components.options import Options
|
|
2
|
+
from mloda.user import DataAccessCollection
|
|
3
|
+
from mloda.provider import FeatureSet, HashableDict, BaseInputData
|
|
4
|
+
from mloda import Options
|
|
7
5
|
|
|
8
6
|
|
|
9
7
|
class ReadDB(BaseInputData):
|
|
@@ -85,11 +83,11 @@ class ReadDB(BaseInputData):
|
|
|
85
83
|
data_accesses.append(data_access)
|
|
86
84
|
|
|
87
85
|
if not data_accesses:
|
|
88
|
-
return
|
|
86
|
+
return None
|
|
89
87
|
|
|
90
88
|
matched_data_access = cls.match_read_db_data_access(data_accesses, feature_names)
|
|
91
|
-
if matched_data_access is
|
|
92
|
-
return
|
|
89
|
+
if matched_data_access is None:
|
|
90
|
+
return None
|
|
93
91
|
return matched_data_access
|
|
94
92
|
|
|
95
93
|
@classmethod
|
|
@@ -110,7 +108,7 @@ class ReadDB(BaseInputData):
|
|
|
110
108
|
return data_access
|
|
111
109
|
except NotImplementedError:
|
|
112
110
|
continue
|
|
113
|
-
return
|
|
111
|
+
return None
|
|
114
112
|
|
|
115
113
|
@classmethod
|
|
116
114
|
def get_connection(cls, credentials: Any) -> Any:
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
from typing import Any, Optional
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from mloda import FeatureGroup
|
|
4
|
+
from mloda.provider import FeatureSet
|
|
5
|
+
from mloda.provider import BaseInputData
|
|
6
6
|
from mloda_plugins.feature_group.input_data.read_db import ReadDB
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class ReadDBFeature(
|
|
9
|
+
class ReadDBFeature(FeatureGroup):
|
|
10
10
|
@classmethod
|
|
11
11
|
def input_data(cls) -> Optional[BaseInputData]:
|
|
12
12
|
return ReadDB()
|
|
@@ -4,10 +4,9 @@ from typing import Any
|
|
|
4
4
|
import pyarrow as pa
|
|
5
5
|
import sqlite3
|
|
6
6
|
|
|
7
|
-
from
|
|
8
|
-
from
|
|
9
|
-
from
|
|
10
|
-
from mloda_core.abstract_plugins.components.options import Options
|
|
7
|
+
from mloda.provider import FeatureSet, HashableDict
|
|
8
|
+
from mloda.user import DataType
|
|
9
|
+
from mloda import Options
|
|
11
10
|
from mloda_plugins.feature_group.input_data.read_db import ReadDB
|
|
12
11
|
|
|
13
12
|
|
|
@@ -46,9 +45,9 @@ class SQLITEReader(ReadDB):
|
|
|
46
45
|
Uses Options with database credentials and configuration:
|
|
47
46
|
|
|
48
47
|
```python
|
|
49
|
-
from
|
|
50
|
-
from
|
|
51
|
-
from
|
|
48
|
+
from mloda import Feature
|
|
49
|
+
from mloda import Options
|
|
50
|
+
from mloda.core.abstract_plugins.components.hashable_dict import HashableDict
|
|
52
51
|
|
|
53
52
|
feature = Feature(
|
|
54
53
|
name="customer_name",
|
|
@@ -68,8 +67,8 @@ class SQLITEReader(ReadDB):
|
|
|
68
67
|
### Basic SQLite Feature Access
|
|
69
68
|
|
|
70
69
|
```python
|
|
71
|
-
from
|
|
72
|
-
from
|
|
70
|
+
from mloda import Feature
|
|
71
|
+
from mloda.core.abstract_plugins.components.hashable_dict import HashableDict
|
|
73
72
|
|
|
74
73
|
# Simple column reference from SQLite database
|
|
75
74
|
feature = Feature(
|
|
@@ -117,7 +116,7 @@ class SQLITEReader(ReadDB):
|
|
|
117
116
|
### Using DataAccessCollection
|
|
118
117
|
|
|
119
118
|
```python
|
|
120
|
-
from
|
|
119
|
+
from mloda.user import DataAccessCollection
|
|
121
120
|
|
|
122
121
|
# Configure database access at the collection level
|
|
123
122
|
data_access = DataAccessCollection(
|
|
@@ -214,11 +213,22 @@ class SQLITEReader(ReadDB):
|
|
|
214
213
|
def load_data(cls, data_access: Any, features: FeatureSet) -> Any:
|
|
215
214
|
query = cls.build_query(features)
|
|
216
215
|
result, column_names = cls.read_db(data_access, query)
|
|
217
|
-
return cls.read_as_pa_data(result, column_names)
|
|
216
|
+
return cls.read_as_pa_data(result, column_names, features)
|
|
218
217
|
|
|
219
218
|
@classmethod
|
|
220
|
-
def read_as_pa_data(cls, result: Any, column_names: Any) -> Any:
|
|
221
|
-
|
|
219
|
+
def read_as_pa_data(cls, result: Any, column_names: Any, features: Any) -> Any:
|
|
220
|
+
feature_map = {f.get_name(): f for f in features.features}
|
|
221
|
+
|
|
222
|
+
schema_fields = []
|
|
223
|
+
for i, col_name in enumerate(column_names):
|
|
224
|
+
feature = feature_map.get(col_name)
|
|
225
|
+
if feature and feature.data_type:
|
|
226
|
+
arrow_type = DataType.to_arrow_type(feature.data_type)
|
|
227
|
+
else:
|
|
228
|
+
arrow_type = DataType.infer_arrow_type(result[0][i])
|
|
229
|
+
schema_fields.append((col_name, arrow_type))
|
|
230
|
+
|
|
231
|
+
schema = pa.schema(schema_fields)
|
|
222
232
|
data_dicts = [{column_names[i]: row[i] for i in range(len(row))} for row in result]
|
|
223
233
|
table = pa.Table.from_pylist(data_dicts, schema=schema)
|
|
224
234
|
return table
|