mloda 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/METADATA +10 -10
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/RECORD +92 -91
- mloda_core/abstract_plugins/components/base_artifact.py +3 -1
- mloda_core/abstract_plugins/components/feature.py +4 -4
- mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +44 -17
- mloda_core/abstract_plugins/components/feature_collection.py +2 -2
- mloda_core/abstract_plugins/components/feature_group_version.py +4 -4
- mloda_core/abstract_plugins/components/feature_name.py +0 -3
- mloda_core/abstract_plugins/components/input_data/base_input_data.py +3 -3
- mloda_core/abstract_plugins/components/link.py +113 -29
- mloda_core/abstract_plugins/components/options.py +10 -10
- mloda_core/api/prepare/setup_compute_framework.py +2 -2
- mloda_core/api/request.py +44 -13
- mloda_core/core/step/feature_group_step.py +2 -1
- mloda_core/filter/filter_engine.py +3 -12
- mloda_core/filter/filter_parameter.py +55 -0
- mloda_core/filter/single_filter.py +4 -4
- mloda_core/prepare/execution_plan.py +12 -6
- mloda_core/prepare/graph/graph.py +3 -3
- mloda_core/prepare/identify_feature_group.py +10 -3
- mloda_core/prepare/resolve_links.py +86 -18
- mloda_core/runtime/flight/flight_server.py +1 -1
- mloda_core/runtime/run.py +7 -5
- mloda_core/runtime/worker/multiprocessing_worker.py +11 -9
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +7 -33
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +22 -12
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +5 -5
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +8 -34
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_merge_engine.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +7 -33
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +1 -1
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +13 -32
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +1 -1
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +13 -32
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +4 -4
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +12 -18
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +20 -17
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +8 -8
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +8 -8
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +7 -7
- mloda_plugins/feature_group/experimental/clustering/base.py +26 -26
- mloda_plugins/feature_group/experimental/clustering/pandas.py +31 -29
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +23 -22
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +16 -16
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +9 -11
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +8 -8
- mloda_plugins/feature_group/experimental/default_options_key.py +1 -1
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +17 -15
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +30 -18
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +35 -35
- mloda_plugins/feature_group/experimental/forecasting/base.py +39 -29
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +18 -18
- mloda_plugins/feature_group/experimental/geo_distance/base.py +18 -20
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +6 -6
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +2 -2
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +2 -2
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +2 -2
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +3 -2
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +8 -12
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +11 -12
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +9 -14
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +8 -9
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +10 -10
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +8 -11
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +27 -25
- mloda_plugins/feature_group/experimental/time_window/pandas.py +8 -8
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +6 -6
- mloda_plugins/feature_group/input_data/read_context_files.py +1 -1
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +1 -1
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/WHEEL +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/NOTICE.md +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from collections import defaultdict
|
|
2
2
|
from copy import copy
|
|
3
|
-
from typing import Dict, List, Set
|
|
3
|
+
from typing import DefaultDict, Dict, List, Set
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
6
|
from mloda_core.prepare.graph.properties import EdgeProperties, NodeProperties
|
|
@@ -8,8 +8,8 @@ from mloda_core.prepare.graph.properties import EdgeProperties, NodeProperties
|
|
|
8
8
|
|
|
9
9
|
class Graph:
|
|
10
10
|
def __init__(self) -> None:
|
|
11
|
-
self.nodes:
|
|
12
|
-
self.edges:
|
|
11
|
+
self.nodes: DefaultDict[UUID, NodeProperties] = defaultdict(lambda: NodeProperties(None, None)) # type: ignore[arg-type]
|
|
12
|
+
self.edges: DefaultDict[tuple[UUID, UUID], EdgeProperties] = defaultdict(lambda: EdgeProperties(None, None)) # type: ignore[arg-type]
|
|
13
13
|
|
|
14
14
|
self.adjacency_list: Dict[UUID, list[UUID]] = defaultdict(list)
|
|
15
15
|
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from typing import Optional, Set, Tuple, Type
|
|
2
|
+
|
|
2
3
|
from mloda_core.prepare.accessible_plugins import FeatureGroupEnvironmentMapping
|
|
3
4
|
from mloda_core.abstract_plugins.components.data_access_collection import DataAccessCollection
|
|
4
5
|
from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
|
|
@@ -145,14 +146,20 @@ class IdentifyFeatureGroupClass:
|
|
|
145
146
|
Check if the code is running in a notebook environment.
|
|
146
147
|
"""
|
|
147
148
|
try:
|
|
148
|
-
from IPython import get_ipython # type: ignore
|
|
149
|
+
from IPython import get_ipython # type: ignore[attr-defined]
|
|
149
150
|
|
|
150
|
-
|
|
151
|
+
ipython_instance = get_ipython() # type: ignore[no-untyped-call]
|
|
152
|
+
if ipython_instance is None:
|
|
153
|
+
return ""
|
|
154
|
+
shell: str = ipython_instance.__class__.__name__
|
|
151
155
|
if shell == "ZMQInteractiveShell":
|
|
152
|
-
return """If you are running this in a notebook, please restart the kernel to clear any cached plugins.
|
|
156
|
+
return """If you are running this in a notebook, please restart the kernel to clear any cached plugins.
|
|
153
157
|
If you experience this multiple times, please open an issue or contact the maintainers for prioritization.
|
|
154
158
|
https://github.com/mloda-ai/mloda/issues
|
|
155
159
|
"""
|
|
160
|
+
except ImportError:
|
|
161
|
+
# IPython not installed
|
|
162
|
+
pass
|
|
156
163
|
except Exception:
|
|
157
164
|
# An exception here means we are not in a notebook environment.
|
|
158
165
|
pass # nosec B110
|
|
@@ -270,24 +270,92 @@ class ResolveLinks:
|
|
|
270
270
|
if not self.links:
|
|
271
271
|
return
|
|
272
272
|
|
|
273
|
-
for
|
|
274
|
-
for
|
|
275
|
-
for
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
273
|
+
for child, parents in self.graph.parent_to_children_mapping.items():
|
|
274
|
+
for parent_in in parents:
|
|
275
|
+
for parent_out in parents:
|
|
276
|
+
if parent_in == parent_out:
|
|
277
|
+
continue
|
|
278
|
+
|
|
279
|
+
r_left = self.graph.get_nodes()[parent_in]
|
|
280
|
+
r_right = self.graph.get_nodes()[parent_out]
|
|
281
|
+
left_fg = r_left.feature_group_class
|
|
282
|
+
right_fg = r_right.feature_group_class
|
|
283
|
+
|
|
284
|
+
# Two-pass matching: exact match first, then polymorphic
|
|
285
|
+
matched_links = self._find_matching_links(left_fg, right_fg)
|
|
286
|
+
|
|
287
|
+
for matched_link in matched_links:
|
|
288
|
+
key = self.create_link_trekker_key(
|
|
289
|
+
matched_link, r_left.feature.compute_frameworks, r_right.feature.compute_frameworks
|
|
290
|
+
)
|
|
291
|
+
self.set_link_trekker(key, child)
|
|
292
|
+
|
|
293
|
+
def _find_matching_links(self, left_fg: type, right_fg: type) -> List[Link]:
|
|
294
|
+
"""Find all matching links using two-pass matching: exact first, then polymorphic.
|
|
295
|
+
|
|
296
|
+
Returns all exact matches if any exist, otherwise returns the most specific
|
|
297
|
+
polymorphic matches (closest in inheritance hierarchy).
|
|
298
|
+
"""
|
|
299
|
+
if self.links is None:
|
|
300
|
+
return []
|
|
301
|
+
|
|
302
|
+
# Pass 1: Collect all exact matches
|
|
303
|
+
exact_matches = [link for link in self.links if link.matches_exact(left_fg, right_fg)]
|
|
304
|
+
if exact_matches:
|
|
305
|
+
return exact_matches
|
|
306
|
+
|
|
307
|
+
# Pass 2: If no exact matches, find most specific polymorphic matches
|
|
308
|
+
polymorphic_matches = [link for link in self.links if link.matches_polymorphic(left_fg, right_fg)]
|
|
309
|
+
if not polymorphic_matches:
|
|
310
|
+
return []
|
|
311
|
+
|
|
312
|
+
# Find the most specific match (smallest inheritance distance)
|
|
313
|
+
return self._select_most_specific_links(polymorphic_matches, left_fg, right_fg)
|
|
314
|
+
|
|
315
|
+
def _inheritance_distance(self, child: type, parent: type) -> int:
|
|
316
|
+
"""Calculate the inheritance distance from child to parent in the MRO.
|
|
317
|
+
|
|
318
|
+
Returns the number of steps in the Method Resolution Order from child to parent.
|
|
319
|
+
Returns a large number if parent is not in child's MRO.
|
|
320
|
+
"""
|
|
321
|
+
try:
|
|
322
|
+
mro = child.__mro__
|
|
323
|
+
return mro.index(parent)
|
|
324
|
+
except (ValueError, AttributeError):
|
|
325
|
+
return 9999 # Not in hierarchy
|
|
326
|
+
|
|
327
|
+
def _select_most_specific_links(self, links: List[Link], left_fg: type, right_fg: type) -> List[Link]:
|
|
328
|
+
"""Select links that are most specific (closest in inheritance hierarchy).
|
|
329
|
+
|
|
330
|
+
For each link, calculates the inheritance distance on both sides.
|
|
331
|
+
Only considers links where both sides have the same inheritance distance
|
|
332
|
+
(to avoid sibling mismatches). Returns links with the minimum distance.
|
|
333
|
+
"""
|
|
334
|
+
if not links:
|
|
335
|
+
return []
|
|
336
|
+
|
|
337
|
+
# Calculate distance for each link, filtering out unbalanced matches
|
|
338
|
+
link_distances: List[Tuple[Link, int]] = []
|
|
339
|
+
for link in links:
|
|
340
|
+
left_dist = self._inheritance_distance(left_fg, link.left_feature_group)
|
|
341
|
+
right_dist = self._inheritance_distance(right_fg, link.right_feature_group)
|
|
342
|
+
|
|
343
|
+
# Only consider links where both sides have the same inheritance level
|
|
344
|
+
# This prevents sibling class mismatches for self-join patterns
|
|
345
|
+
link_is_self_join = link.left_feature_group == link.right_feature_group
|
|
346
|
+
# For self-joins: require same concrete class to prevent sibling mismatches
|
|
347
|
+
# For different-class joins: balanced distance is sufficient
|
|
348
|
+
if left_dist == right_dist and (not link_is_self_join or left_fg == right_fg):
|
|
349
|
+
link_distances.append((link, left_dist))
|
|
350
|
+
|
|
351
|
+
if not link_distances:
|
|
352
|
+
return []
|
|
353
|
+
|
|
354
|
+
# Find minimum distance
|
|
355
|
+
min_dist = min(dist for _, dist in link_distances)
|
|
356
|
+
|
|
357
|
+
# Return all links with minimum distance
|
|
358
|
+
return [link for link, dist in link_distances if dist == min_dist]
|
|
291
359
|
|
|
292
360
|
def set_link_trekker(self, link_trekker_key: LinkFrameworkTrekker, uuid: UUID) -> None:
|
|
293
361
|
self.link_trekker.update(link_trekker_key, uuid)
|
|
@@ -18,7 +18,7 @@ def create_location(host: str = "0.0.0.0") -> str:
|
|
|
18
18
|
return f"grpc://{host}:{port}"
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
class FlightServer(flight.FlightServerBase): # type: ignore
|
|
21
|
+
class FlightServer(flight.FlightServerBase): # type: ignore[misc]
|
|
22
22
|
def __init__(self, location: Any = create_location()) -> None:
|
|
23
23
|
self.tables: Dict[str, Any] = {} # Dictionary to store tables
|
|
24
24
|
self.location = location
|
mloda_core/runtime/run.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from collections import defaultdict
|
|
2
4
|
import multiprocessing
|
|
3
5
|
import queue
|
|
@@ -59,10 +61,10 @@ class Runner:
|
|
|
59
61
|
# multiprocessing
|
|
60
62
|
self.location: Optional[str] = None
|
|
61
63
|
self.tasks: List[Union[threading.Thread, multiprocessing.Process]] = []
|
|
62
|
-
self.process_register: Dict[
|
|
63
|
-
UUID, Tuple[multiprocessing.Process, multiprocessing.Queue, multiprocessing.Queue]
|
|
64
|
+
self.process_register: Dict[
|
|
65
|
+
UUID, Tuple[multiprocessing.Process, multiprocessing.Queue[Any], multiprocessing.Queue[Any]]
|
|
64
66
|
] = defaultdict()
|
|
65
|
-
self.result_queues_collection: Set[multiprocessing.Queue] = set()
|
|
67
|
+
self.result_queues_collection: Set[multiprocessing.Queue[Any]] = set()
|
|
66
68
|
self.result_uuids_collection: Set[UUID] = set()
|
|
67
69
|
|
|
68
70
|
# Initialize framework transformer
|
|
@@ -524,7 +526,7 @@ class Runner:
|
|
|
524
526
|
"""
|
|
525
527
|
MyManager.register("CfwManager", CfwManager)
|
|
526
528
|
self.manager = MyManager().__enter__()
|
|
527
|
-
self.cfw_register = self.manager.CfwManager(parallelization_modes, function_extender) # type: ignore
|
|
529
|
+
self.cfw_register = self.manager.CfwManager(parallelization_modes, function_extender) # type: ignore[attr-defined]
|
|
528
530
|
|
|
529
531
|
if self.flight_server:
|
|
530
532
|
if self.flight_server.flight_server_process is None:
|
|
@@ -587,7 +589,7 @@ class Runner:
|
|
|
587
589
|
|
|
588
590
|
def _get_execution_function(
|
|
589
591
|
self, mode_by_cfw_register: Set[ParallelizationModes], mode_by_step: Set[ParallelizationModes]
|
|
590
|
-
) -> Callable
|
|
592
|
+
) -> Callable[[Any], None]:
|
|
591
593
|
"""
|
|
592
594
|
Identifies the execution mode and returns the corresponding execute step function.
|
|
593
595
|
|
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import multiprocessing
|
|
3
5
|
import time
|
|
4
6
|
import traceback
|
|
5
|
-
from typing import Any, Union
|
|
7
|
+
from typing import Any, Set, Union
|
|
6
8
|
from uuid import UUID
|
|
7
9
|
from queue import Empty
|
|
8
10
|
|
|
@@ -16,16 +18,16 @@ from mloda_core.core.step.transform_frame_work_step import TransformFrameworkSte
|
|
|
16
18
|
logger = logging.getLogger(__name__)
|
|
17
19
|
|
|
18
20
|
|
|
19
|
-
def _handle_stop_command(command_queue: multiprocessing.Queue) -> None:
|
|
21
|
+
def _handle_stop_command(command_queue: multiprocessing.Queue[Any]) -> None:
|
|
20
22
|
"""Puts a 'STOP' command in the command queue."""
|
|
21
23
|
if command_queue:
|
|
22
24
|
command_queue.put("STOP", block=False)
|
|
23
25
|
|
|
24
26
|
|
|
25
27
|
def _handle_data_dropping(
|
|
26
|
-
command_queue: multiprocessing.Queue,
|
|
28
|
+
command_queue: multiprocessing.Queue[Any],
|
|
27
29
|
cfw: ComputeFrameWork,
|
|
28
|
-
command:
|
|
30
|
+
command: Set[Any],
|
|
29
31
|
location: str,
|
|
30
32
|
) -> bool:
|
|
31
33
|
"""Handles dropping already calculated data based on the provided command."""
|
|
@@ -46,7 +48,7 @@ def _execute_command(
|
|
|
46
48
|
"""Executes a given command based on its type."""
|
|
47
49
|
if isinstance(command, JoinStep):
|
|
48
50
|
# Left framework here, because it is already transformed beforehand
|
|
49
|
-
from_cfw = cfw_register.get_cfw_uuid(command.left_framework.get_class_name(), command.link.uuid) # type: ignore
|
|
51
|
+
from_cfw = cfw_register.get_cfw_uuid(command.left_framework.get_class_name(), command.link.uuid) # type: ignore[assignment]
|
|
50
52
|
|
|
51
53
|
if from_cfw is None:
|
|
52
54
|
from_cfw = cfw_register.get_cfw_uuid(
|
|
@@ -74,7 +76,7 @@ def _handle_command_result(
|
|
|
74
76
|
cfw: ComputeFrameWork,
|
|
75
77
|
location: str,
|
|
76
78
|
data: Any,
|
|
77
|
-
result_queue: multiprocessing.Queue,
|
|
79
|
+
result_queue: multiprocessing.Queue[Any],
|
|
78
80
|
) -> None:
|
|
79
81
|
"""Handles the result of a command execution, including uploading data if necessary."""
|
|
80
82
|
if not isinstance(data, str) and isinstance(command, FeatureGroupStep):
|
|
@@ -89,8 +91,8 @@ def _handle_command_result(
|
|
|
89
91
|
|
|
90
92
|
|
|
91
93
|
def worker(
|
|
92
|
-
command_queue: multiprocessing.Queue,
|
|
93
|
-
result_queue: multiprocessing.Queue,
|
|
94
|
+
command_queue: multiprocessing.Queue[Any],
|
|
95
|
+
result_queue: multiprocessing.Queue[Any],
|
|
94
96
|
cfw_register: CfwManager,
|
|
95
97
|
cfw: ComputeFrameWork,
|
|
96
98
|
from_cfw: UUID,
|
|
@@ -135,7 +137,7 @@ def worker(
|
|
|
135
137
|
time.sleep(0.0001)
|
|
136
138
|
|
|
137
139
|
|
|
138
|
-
def error_out(cfw_register: CfwManager, command_queue: multiprocessing.Queue) -> None:
|
|
140
|
+
def error_out(cfw_register: CfwManager, command_queue: multiprocessing.Queue[Any]) -> None:
|
|
139
141
|
msg = """This is a critical error, the location should not be None."""
|
|
140
142
|
logging.error(msg)
|
|
141
143
|
exc_info = traceback.format_exc()
|
|
@@ -31,11 +31,7 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
31
31
|
column_name = filter_feature.name.name
|
|
32
32
|
|
|
33
33
|
# Extract the value from the parameter
|
|
34
|
-
value =
|
|
35
|
-
for param in filter_feature.parameter:
|
|
36
|
-
if param[0] == "value":
|
|
37
|
-
value = param[1]
|
|
38
|
-
break
|
|
34
|
+
value = filter_feature.parameter.value
|
|
39
35
|
|
|
40
36
|
if value is None:
|
|
41
37
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -49,14 +45,8 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
49
45
|
column_name = filter_feature.name.name
|
|
50
46
|
|
|
51
47
|
# Check if this is a complex parameter with max/max_exclusive or a simple one with value
|
|
52
|
-
has_max =
|
|
53
|
-
has_value =
|
|
54
|
-
|
|
55
|
-
for param in filter_feature.parameter:
|
|
56
|
-
if param[0] == "max":
|
|
57
|
-
has_max = True
|
|
58
|
-
elif param[0] == "value":
|
|
59
|
-
has_value = True
|
|
48
|
+
has_max = filter_feature.parameter.max_value is not None
|
|
49
|
+
has_value = filter_feature.parameter.value is not None
|
|
60
50
|
|
|
61
51
|
if has_max:
|
|
62
52
|
# Complex parameter - use get_min_max_operator
|
|
@@ -78,11 +68,7 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
78
68
|
condition = f'"{column_name}" <= {max_parameter}'
|
|
79
69
|
elif has_value:
|
|
80
70
|
# Simple parameter - extract the value
|
|
81
|
-
value =
|
|
82
|
-
for param in filter_feature.parameter:
|
|
83
|
-
if param[0] == "value":
|
|
84
|
-
value = param[1]
|
|
85
|
-
break
|
|
71
|
+
value = filter_feature.parameter.value
|
|
86
72
|
|
|
87
73
|
if value is None:
|
|
88
74
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -98,11 +84,7 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
98
84
|
column_name = filter_feature.name.name
|
|
99
85
|
|
|
100
86
|
# Extract the value from the parameter
|
|
101
|
-
value =
|
|
102
|
-
for param in filter_feature.parameter:
|
|
103
|
-
if param[0] == "value":
|
|
104
|
-
value = param[1]
|
|
105
|
-
break
|
|
87
|
+
value = filter_feature.parameter.value
|
|
106
88
|
|
|
107
89
|
if value is None:
|
|
108
90
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -120,11 +102,7 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
120
102
|
column_name = filter_feature.name.name
|
|
121
103
|
|
|
122
104
|
# Extract the value from the parameter
|
|
123
|
-
value =
|
|
124
|
-
for param in filter_feature.parameter:
|
|
125
|
-
if param[0] == "value":
|
|
126
|
-
value = param[1]
|
|
127
|
-
break
|
|
105
|
+
value = filter_feature.parameter.value
|
|
128
106
|
|
|
129
107
|
if value is None:
|
|
130
108
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -139,11 +117,7 @@ class DuckDBFilterEngine(BaseFilterEngine):
|
|
|
139
117
|
column_name = filter_feature.name.name
|
|
140
118
|
|
|
141
119
|
# Extract the values from the parameter
|
|
142
|
-
values =
|
|
143
|
-
for param in filter_feature.parameter:
|
|
144
|
-
if param[0] == "values":
|
|
145
|
-
values = param[1]
|
|
146
|
-
break
|
|
120
|
+
values = filter_feature.parameter.values
|
|
147
121
|
|
|
148
122
|
if values is None:
|
|
149
123
|
raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
|
|
@@ -6,13 +6,13 @@ try:
|
|
|
6
6
|
from pyiceberg.table import Table as IcebergTable
|
|
7
7
|
from pyiceberg.expressions import GreaterThan, LessThan, GreaterThanOrEqual, LessThanOrEqual, EqualTo, And
|
|
8
8
|
except ImportError:
|
|
9
|
-
IcebergTable: Optional[Type] = None # type: ignore
|
|
10
|
-
GreaterThan: Optional[Type] = None # type: ignore
|
|
11
|
-
LessThan: Optional[Type] = None # type: ignore
|
|
12
|
-
GreaterThanOrEqual: Optional[Type] = None # type: ignore
|
|
13
|
-
LessThanOrEqual: Optional[Type] = None # type: ignore
|
|
14
|
-
EqualTo: Optional[Type] = None # type: ignore
|
|
15
|
-
And: Optional[Type] = None # type: ignore
|
|
9
|
+
IcebergTable: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
10
|
+
GreaterThan: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
11
|
+
LessThan: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
12
|
+
GreaterThanOrEqual: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
13
|
+
LessThanOrEqual: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
14
|
+
EqualTo: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
15
|
+
And: Optional[Type[Any]] = None # type: ignore[no-redef]
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
class IcebergFilterEngine(BaseFilterEngine):
|
|
@@ -111,7 +111,7 @@ class IcebergFilterEngine(BaseFilterEngine):
|
|
|
111
111
|
max_expr = (
|
|
112
112
|
LessThan(column_name, max_param) if is_max_exclusive else LessThanOrEqual(column_name, max_param)
|
|
113
113
|
)
|
|
114
|
-
expressions.append(max_expr) # type: ignore
|
|
114
|
+
expressions.append(max_expr) # type: ignore[arg-type]
|
|
115
115
|
|
|
116
116
|
if len(expressions) == 1:
|
|
117
117
|
return expressions[0]
|
|
@@ -123,15 +123,25 @@ class IcebergFilterEngine(BaseFilterEngine):
|
|
|
123
123
|
@classmethod
|
|
124
124
|
def _extract_parameter_value(cls, filter_feature: SingleFilter, param_name: str) -> Any:
|
|
125
125
|
"""Extract a parameter value from filter feature."""
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
126
|
+
if param_name == "value":
|
|
127
|
+
return filter_feature.parameter.value
|
|
128
|
+
elif param_name == "values":
|
|
129
|
+
return filter_feature.parameter.values
|
|
130
|
+
elif param_name == "min":
|
|
131
|
+
return filter_feature.parameter.min_value
|
|
132
|
+
elif param_name == "max":
|
|
133
|
+
return filter_feature.parameter.max_value
|
|
134
|
+
elif param_name == "max_exclusive":
|
|
135
|
+
return filter_feature.parameter.max_exclusive
|
|
129
136
|
return None
|
|
130
137
|
|
|
131
138
|
@classmethod
|
|
132
139
|
def _has_parameter(cls, filter_feature: SingleFilter, param_name: str) -> bool:
|
|
133
140
|
"""Check if filter feature has a specific parameter."""
|
|
134
|
-
|
|
141
|
+
value = cls._extract_parameter_value(filter_feature, param_name)
|
|
142
|
+
if param_name == "max_exclusive":
|
|
143
|
+
return True
|
|
144
|
+
return value is not None
|
|
135
145
|
|
|
136
146
|
# Standard filter methods - not used for Iceberg but required by interface
|
|
137
147
|
@classmethod
|
|
@@ -10,8 +10,8 @@ try:
|
|
|
10
10
|
from pyiceberg.table import Table as IcebergTable
|
|
11
11
|
import pyarrow as pa
|
|
12
12
|
except ImportError:
|
|
13
|
-
Catalog = None # type: ignore
|
|
14
|
-
IcebergTable = None # type: ignore
|
|
13
|
+
Catalog = None # type: ignore[assignment,misc]
|
|
14
|
+
IcebergTable = None # type: ignore[assignment,misc]
|
|
15
15
|
pa = None
|
|
16
16
|
|
|
17
17
|
|
|
@@ -5,11 +5,11 @@ try:
|
|
|
5
5
|
from pyiceberg.table import Table as IcebergTable
|
|
6
6
|
import pyarrow as pa
|
|
7
7
|
except ImportError:
|
|
8
|
-
IcebergTable = None # type: ignore
|
|
8
|
+
IcebergTable = None # type: ignore[assignment,misc]
|
|
9
9
|
pa = None
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
class
|
|
12
|
+
class IcebergPyArrowTransformer(BaseTransformer):
|
|
13
13
|
"""
|
|
14
14
|
Transformer for converting between Iceberg tables and PyArrow tables.
|
|
15
15
|
|
|
@@ -12,7 +12,7 @@ except ImportError:
|
|
|
12
12
|
pd = None
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
class
|
|
15
|
+
class PandasDataFrame(ComputeFrameWork):
|
|
16
16
|
@staticmethod
|
|
17
17
|
def is_available() -> bool:
|
|
18
18
|
"""Check if Pandas is installed and available."""
|
|
@@ -25,7 +25,7 @@ class PandasDataframe(ComputeFrameWork):
|
|
|
25
25
|
|
|
26
26
|
@staticmethod
|
|
27
27
|
def expected_data_framework() -> Any:
|
|
28
|
-
return
|
|
28
|
+
return PandasDataFrame.pd_dataframe()
|
|
29
29
|
|
|
30
30
|
def merge_engine(self) -> Type[BaseMergeEngine]:
|
|
31
31
|
return PandasMergeEngine
|
|
@@ -9,10 +9,10 @@ from mloda_plugins.compute_framework.base_implementations.polars.polars_filter_e
|
|
|
9
9
|
try:
|
|
10
10
|
import polars as pl
|
|
11
11
|
except ImportError:
|
|
12
|
-
pl = None # type: ignore
|
|
12
|
+
pl = None # type: ignore[assignment]
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
class
|
|
15
|
+
class PolarsDataFrame(ComputeFrameWork):
|
|
16
16
|
@staticmethod
|
|
17
17
|
def is_available() -> bool:
|
|
18
18
|
"""Check if Polars is installed and available."""
|
|
@@ -25,7 +25,7 @@ class PolarsDataframe(ComputeFrameWork):
|
|
|
25
25
|
|
|
26
26
|
@staticmethod
|
|
27
27
|
def expected_data_framework() -> Any:
|
|
28
|
-
return
|
|
28
|
+
return PolarsDataFrame.pl_dataframe()
|
|
29
29
|
|
|
30
30
|
def merge_engine(self) -> Type[BaseMergeEngine]:
|
|
31
31
|
return PolarsMergeEngine
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from typing import Any, Set, Type
|
|
2
2
|
from mloda_core.abstract_plugins.components.feature_name import FeatureName
|
|
3
|
-
from mloda_plugins.compute_framework.base_implementations.polars.dataframe import
|
|
3
|
+
from mloda_plugins.compute_framework.base_implementations.polars.dataframe import PolarsDataFrame
|
|
4
4
|
from mloda_core.abstract_plugins.components.merge.base_merge_engine import BaseMergeEngine
|
|
5
5
|
from mloda_plugins.compute_framework.base_implementations.polars.polars_lazy_merge_engine import PolarsLazyMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
import polars as pl
|
|
9
9
|
except ImportError:
|
|
10
|
-
pl = None # type: ignore
|
|
10
|
+
pl = None # type: ignore[assignment]
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
class
|
|
13
|
+
class PolarsLazyDataFrame(PolarsDataFrame):
|
|
14
14
|
"""
|
|
15
|
-
Lazy evaluation version of
|
|
15
|
+
Lazy evaluation version of PolarsDataFrame using pl.LazyFrame.
|
|
16
16
|
|
|
17
17
|
This compute framework defers execution of operations until results are explicitly
|
|
18
18
|
requested, enabling query optimization and reduced memory usage for large datasets.
|
|
@@ -20,7 +20,7 @@ class PolarsLazyDataframe(PolarsDataframe):
|
|
|
20
20
|
|
|
21
21
|
@staticmethod
|
|
22
22
|
def expected_data_framework() -> Any:
|
|
23
|
-
return
|
|
23
|
+
return PolarsLazyDataFrame.pl_lazy_frame()
|
|
24
24
|
|
|
25
25
|
def merge_engine(self) -> Type[BaseMergeEngine]:
|
|
26
26
|
return PolarsLazyMergeEngine
|
|
@@ -5,7 +5,7 @@ from mloda_core.filter.single_filter import SingleFilter
|
|
|
5
5
|
try:
|
|
6
6
|
import polars as pl
|
|
7
7
|
except ImportError:
|
|
8
|
-
pl = None # type: ignore
|
|
8
|
+
pl = None # type: ignore[assignment]
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class PolarsFilterEngine(BaseFilterEngine):
|
|
@@ -37,11 +37,7 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
37
37
|
column_name = filter_feature.name.name
|
|
38
38
|
|
|
39
39
|
# Extract the value from the parameter
|
|
40
|
-
value =
|
|
41
|
-
for param in filter_feature.parameter:
|
|
42
|
-
if param[0] == "value":
|
|
43
|
-
value = param[1]
|
|
44
|
-
break
|
|
40
|
+
value = filter_feature.parameter.value
|
|
45
41
|
|
|
46
42
|
if value is None:
|
|
47
43
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -53,14 +49,8 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
53
49
|
column_name = filter_feature.name.name
|
|
54
50
|
|
|
55
51
|
# Check if this is a complex parameter with max/max_exclusive or a simple one with value
|
|
56
|
-
has_max =
|
|
57
|
-
has_value =
|
|
58
|
-
|
|
59
|
-
for param in filter_feature.parameter:
|
|
60
|
-
if param[0] == "max":
|
|
61
|
-
has_max = True
|
|
62
|
-
elif param[0] == "value":
|
|
63
|
-
has_value = True
|
|
52
|
+
has_max = filter_feature.parameter.max_value is not None
|
|
53
|
+
has_value = filter_feature.parameter.value is not None
|
|
64
54
|
|
|
65
55
|
if has_max:
|
|
66
56
|
# Complex parameter - use get_min_max_operator
|
|
@@ -82,11 +72,7 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
82
72
|
return data.filter(pl.col(column_name) <= max_parameter)
|
|
83
73
|
elif has_value:
|
|
84
74
|
# Simple parameter - extract the value
|
|
85
|
-
value =
|
|
86
|
-
for param in filter_feature.parameter:
|
|
87
|
-
if param[0] == "value":
|
|
88
|
-
value = param[1]
|
|
89
|
-
break
|
|
75
|
+
value = filter_feature.parameter.value
|
|
90
76
|
|
|
91
77
|
if value is None:
|
|
92
78
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -100,11 +86,7 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
100
86
|
column_name = filter_feature.name.name
|
|
101
87
|
|
|
102
88
|
# Extract the value from the parameter
|
|
103
|
-
value =
|
|
104
|
-
for param in filter_feature.parameter:
|
|
105
|
-
if param[0] == "value":
|
|
106
|
-
value = param[1]
|
|
107
|
-
break
|
|
89
|
+
value = filter_feature.parameter.value
|
|
108
90
|
|
|
109
91
|
if value is None:
|
|
110
92
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -116,11 +98,7 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
116
98
|
column_name = filter_feature.name.name
|
|
117
99
|
|
|
118
100
|
# Extract the value from the parameter
|
|
119
|
-
value =
|
|
120
|
-
for param in filter_feature.parameter:
|
|
121
|
-
if param[0] == "value":
|
|
122
|
-
value = param[1]
|
|
123
|
-
break
|
|
101
|
+
value = filter_feature.parameter.value
|
|
124
102
|
|
|
125
103
|
if value is None:
|
|
126
104
|
raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
|
|
@@ -132,11 +110,7 @@ class PolarsFilterEngine(BaseFilterEngine):
|
|
|
132
110
|
column_name = filter_feature.name.name
|
|
133
111
|
|
|
134
112
|
# Extract the values from the parameter
|
|
135
|
-
values =
|
|
136
|
-
for param in filter_feature.parameter:
|
|
137
|
-
if param[0] == "values":
|
|
138
|
-
values = param[1]
|
|
139
|
-
break
|
|
113
|
+
values = filter_feature.parameter.values
|
|
140
114
|
|
|
141
115
|
if values is None:
|
|
142
116
|
raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
|