mloda 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. mloda/__init__.py +17 -0
  2. {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
  3. {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
  4. {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
  5. {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
  6. {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
  7. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
  8. mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
  9. {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
  10. {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
  11. {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
  12. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
  13. {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
  14. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
  15. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
  16. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
  17. {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
  18. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
  19. mloda/core/abstract_plugins/components/link.py +437 -0
  20. {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
  21. {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
  22. {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
  23. {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
  24. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
  25. mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
  26. mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
  27. mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
  28. mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
  29. mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
  30. mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +45 -37
  31. mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
  32. mloda/core/abstract_plugins/function_extender.py +78 -0
  33. mloda/core/api/plugin_docs.py +220 -0
  34. mloda/core/api/plugin_info.py +32 -0
  35. {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
  36. {mloda_core → mloda/core}/api/request.py +42 -33
  37. {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
  38. {mloda_core → mloda/core}/core/engine.py +47 -47
  39. {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
  40. {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
  41. {mloda_core → mloda/core}/core/step/join_step.py +14 -14
  42. {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
  43. {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
  44. {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
  45. {mloda_core → mloda/core}/filter/global_filter.py +23 -23
  46. {mloda_core → mloda/core}/filter/single_filter.py +6 -6
  47. {mloda_core → mloda/core}/prepare/accessible_plugins.py +15 -18
  48. {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
  49. {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
  50. {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
  51. {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
  52. {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
  53. {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
  54. {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
  55. {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
  56. {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
  57. mloda/core/prepare/validators/resolve_link_validator.py +32 -0
  58. mloda/core/runtime/compute_framework_executor.py +271 -0
  59. mloda/core/runtime/data_lifecycle_manager.py +160 -0
  60. mloda/core/runtime/flight/__init__.py +0 -0
  61. {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
  62. mloda/core/runtime/run.py +317 -0
  63. mloda/core/runtime/worker/__init__.py +0 -0
  64. {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
  65. {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
  66. mloda/core/runtime/worker_manager.py +96 -0
  67. mloda/provider/__init__.py +101 -0
  68. mloda/steward/__init__.py +25 -0
  69. mloda/user/__init__.py +57 -0
  70. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/METADATA +24 -31
  71. mloda-0.4.1.dist-info/RECORD +248 -0
  72. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/top_level.txt +1 -1
  73. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
  74. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
  75. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
  76. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
  77. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
  78. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
  79. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
  80. mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
  81. mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
  82. mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
  83. mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
  84. mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
  85. mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
  86. mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
  87. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
  88. mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
  89. mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
  90. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
  91. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
  92. mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -11
  93. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
  94. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
  95. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
  96. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
  97. mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
  98. mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
  99. mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +2 -3
  100. mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
  101. mloda_plugins/config/feature/loader.py +2 -2
  102. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -64
  103. mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
  104. mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
  105. mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
  106. mloda_plugins/feature_group/experimental/clustering/base.py +67 -97
  107. mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
  108. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -82
  109. mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
  110. mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
  111. mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
  112. mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
  113. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +81 -96
  114. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
  115. mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
  116. mloda_plugins/feature_group/experimental/forecasting/base.py +108 -106
  117. mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
  118. mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
  119. mloda_plugins/feature_group/experimental/geo_distance/base.py +52 -44
  120. mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -3
  121. mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
  122. mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
  123. mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
  124. mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
  125. mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
  126. mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
  127. mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
  128. mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
  129. mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
  130. mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
  131. mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
  132. mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -74
  133. mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
  134. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +53 -53
  135. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
  136. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
  137. mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +3 -4
  138. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -60
  139. mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
  140. mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -3
  141. mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
  142. mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -63
  143. mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
  144. mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
  145. mloda_plugins/feature_group/experimental/time_window/base.py +108 -95
  146. mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
  147. mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
  148. mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
  149. mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
  150. mloda_plugins/feature_group/input_data/read_db.py +7 -9
  151. mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
  152. mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
  153. mloda_plugins/feature_group/input_data/read_file.py +8 -8
  154. mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
  155. mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
  156. mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
  157. mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
  158. mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
  159. mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
  160. mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
  161. mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
  162. mloda-0.3.3.dist-info/RECORD +0 -230
  163. mloda_core/abstract_plugins/components/link.py +0 -286
  164. mloda_core/abstract_plugins/function_extender.py +0 -34
  165. mloda_core/runtime/run.py +0 -617
  166. {mloda_core → mloda/core}/__init__.py +0 -0
  167. {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
  168. {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
  169. {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
  170. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
  171. {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
  172. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
  173. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
  174. {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
  175. {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
  176. {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
  177. {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
  178. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
  179. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
  180. {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
  181. {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
  182. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
  183. {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
  184. {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
  185. {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
  186. {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
  187. {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
  188. {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
  189. {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
  190. {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
  191. {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
  192. {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
  193. {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
  194. {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
  195. {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
  196. {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
  197. {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
  198. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/WHEEL +0 -0
  199. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/entry_points.txt +0 -0
  200. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/LICENSE.TXT +0 -0
  201. {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/NOTICE.md +0 -0
@@ -0,0 +1,160 @@
1
+ from typing import Any, Dict, List, Optional, Set
2
+ from uuid import UUID
3
+
4
+ from mloda.core.abstract_plugins.components.framework_transformer.cfw_transformer import ComputeFrameworkTransformer
5
+ from mloda.core.abstract_plugins.components.feature_name import FeatureName
6
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
7
+ from mloda.core.abstract_plugins.components.feature_set import FeatureSet
8
+ from mloda.core.runtime.flight.flight_server import FlightServer
9
+
10
+
11
class DataLifecycleManager:
    """
    Bookkeeping for data produced while an execution plan runs.

    Three pieces of state are kept: result data keyed by step UUID, the set of
    UUIDs each compute framework (CFW) is still waiting on before its data is
    dropped, and the artifacts dictionary handed over by the caller.
    """

    def __init__(self, transformer: Optional[ComputeFrameworkTransformer] = None) -> None:
        """
        Starts with empty bookkeeping and selects the transformer to use.

        Args:
            transformer: Transformer forwarded to ``convert_flyserver_data_back``
                when data is downloaded. When None, a new
                ComputeFrameworkTransformer is instantiated.
        """
        if transformer is None:
            transformer = ComputeFrameworkTransformer()
        self.transformer = transformer

        self.artifacts: Dict[str, Any] = {}
        self.track_data_to_drop: Dict[UUID, Set[UUID]] = {}
        self.result_data_collection: Dict[UUID, Any] = {}

    def drop_data_for_finished_cfws(
        self, finished_ids: Set[UUID], cfw_collection: Dict[UUID, ComputeFramework], location: Optional[str] = None
    ) -> None:
        """
        Drops the data of every tracked CFW whose tracked UUIDs are all finished.

        CFWs that were dropped are removed from the tracking dictionary afterwards.
        Nothing happens when ``finished_ids`` is empty.

        Args:
            finished_ids: UUIDs that have been completed so far.
            cfw_collection: CFWs keyed by their UUID.
            location: Optional location string passed through to the drop call.
        """
        if not finished_ids:
            return

        # Decide first, act second: the tracking dict must not change size while iterating it.
        ready_cfws = [
            cfw_uuid for cfw_uuid, pending in self.track_data_to_drop.items() if finished_ids.issuperset(pending)
        ]

        for cfw_uuid in ready_cfws:
            self.drop_cfw_data(cfw_uuid, cfw_collection, location)

        for cfw_uuid in ready_cfws:
            del self.track_data_to_drop[cfw_uuid]

    def drop_cfw_data(
        self, cfw_uuid: UUID, cfw_collection: Dict[UUID, ComputeFramework], location: Optional[str] = None
    ) -> None:
        """
        Calls ``drop_last_data`` on the CFW registered under ``cfw_uuid``.

        Args:
            cfw_uuid: Key of the CFW inside ``cfw_collection``.
            cfw_collection: CFWs keyed by their UUID.
            location: Optional location string. Any falsy value (None or an empty
                string) is forwarded as None.
        """
        target_location = location if location else None
        cfw_collection[cfw_uuid].drop_last_data(target_location)

    def track_flyway_datasets(self, cfw_uuid: UUID, datasets: Set[UUID]) -> None:
        """
        Registers the UUIDs that must be finished before the CFW's data is dropped.

        An existing entry for the same CFW is overwritten.

        Args:
            cfw_uuid: The UUID of the CFW.
            datasets: UUIDs to wait for; the set is stored as given, not copied.
        """
        self.track_data_to_drop[cfw_uuid] = datasets

    def add_to_result_data_collection(
        self, cfw: ComputeFramework, features: FeatureSet, step_uuid: UUID, location: Optional[str] = None
    ) -> None:
        """
        Stores the step's result when the feature set has initially requested features.

        Args:
            cfw: The compute framework to read the data from.
            features: Feature set asked for its initially requested features.
            step_uuid: Key under which the result is stored.
            location: Optional location string used when the data has to be downloaded.
        """
        requested = features.get_initial_requested_features()
        if requested:
            selected = self.get_result_data(cfw, requested, location)
            if selected is not None:
                self.result_data_collection[step_uuid] = selected

    def get_result_data(
        self, cfw: ComputeFramework, selected_feature_names: Set[FeatureName], location: Optional[str] = None
    ) -> Any:
        """
        Selects the requested columns from the CFW's data.

        Data held by the CFW itself is preferred. Otherwise it is downloaded via
        FlightServer from ``location`` and converted back by the CFW.

        Args:
            cfw: The compute framework to read the data from.
            selected_feature_names: Feature names to select.
            location: Optional location string used for the download.

        Returns:
            Whatever ``cfw.select_data_by_column_names`` returns for the data.

        Raises:
            ValueError: If the CFW holds no data and no location is given.
        """
        data = cfw.data
        if data is None:
            if not location:
                raise ValueError("Not implemented")
            downloaded = FlightServer.download_table(location, str(cfw.uuid))
            data = cfw.convert_flyserver_data_back(downloaded, self.transformer)

        return cfw.select_data_by_column_names(data, selected_feature_names)

    def get_results(self) -> List[Any]:
        """
        Returns every stored result as a list.

        Returns:
            The values of the result collection, in insertion order.

        Raises:
            ValueError: If nothing has been collected.
        """
        collected = list(self.result_data_collection.values())
        if not collected:
            raise ValueError("No results found")
        return collected

    def set_artifacts(self, artifacts: Dict[str, Any]) -> None:
        """
        Replaces the stored artifacts dictionary.

        Args:
            artifacts: Dictionary of artifacts to keep.
        """
        self.artifacts = artifacts

    def get_artifacts(self) -> Dict[str, Any]:
        """
        Returns the stored artifacts dictionary.

        Returns:
            The dictionary last passed to ``set_artifacts`` (empty by default).
        """
        return self.artifacts
File without changes
@@ -4,7 +4,7 @@ from typing import Any, List
4
4
 
5
5
  import logging
6
6
 
7
- from mloda_core.runtime.flight.flight_server import FlightServer, create_location
7
+ from mloda.core.runtime.flight.flight_server import FlightServer, create_location
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
@@ -0,0 +1,317 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+ import threading
5
+ import time
6
+ from typing import Any, Dict, List, Optional, Set, Union
7
+ from uuid import UUID
8
+ import logging
9
+
10
+ from mloda.core.abstract_plugins.function_extender import Extender
11
+ from mloda.core.abstract_plugins.components.feature_name import FeatureName
12
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
13
+ from mloda.core.prepare.execution_plan import ExecutionPlan
14
+ from mloda.core.runtime.worker_manager import WorkerManager
15
+ from mloda.core.runtime.data_lifecycle_manager import DataLifecycleManager
16
+ from mloda.core.runtime.compute_framework_executor import ComputeFrameworkExecutor
17
+ from mloda.core.core.cfw_manager import CfwManager, MyManager
18
+ from mloda.core.abstract_plugins.components.parallelization_modes import ParallelizationMode
19
+ from mloda.core.runtime.flight.runner_flight_server import ParallelRunnerFlightServer
20
+ from mloda.core.core.step.feature_group_step import FeatureGroupStep
21
+ from mloda.core.core.step.join_step import JoinStep
22
+ from mloda.core.core.step.transform_frame_work_step import TransformFrameworkStep
23
+ from mloda.core.abstract_plugins.components.feature_set import FeatureSet
24
+
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class ExecutionOrchestrator:
    """
    Orchestrates the execution of an mloda run based on a given execution plan.

    The orchestrator walks the execution plan in a polling loop, starts steps whose
    required UUIDs are finished, collects results of finished feature group steps and
    triggers dropping of data. Worker handling is delegated to WorkerManager, data
    bookkeeping to DataLifecycleManager and step execution to ComputeFrameworkExecutor.
    """

    # One lock shared by every call that touches the running/finished step sets.
    # Creating a new Lock inside each call would never exclude anything. It is kept on
    # the class so instances created without __init__ and copies of an instance still
    # see the same lock.
    _step_state_lock = threading.Lock()

    def __init__(
        self,
        execution_planner: ExecutionPlan,
        flight_server: Optional[ParallelRunnerFlightServer] = None,
    ) -> None:
        """
        Initializes the ExecutionOrchestrator with an execution plan and optional flight server.

        Args:
            execution_planner: The execution plan that defines the steps to be executed.
            flight_server: An optional flight server; when given, its location is
                used in ``__enter__``.
        """
        self.execution_planner = execution_planner

        # Assigned in __enter__; compute() refuses to run before that.
        self.cfw_register: CfwManager

        # multiprocessing - delegate to WorkerManager
        self.location: Optional[str] = None
        self.worker_manager = WorkerManager()

        # Data lifecycle - delegate to DataLifecycleManager
        self.data_lifecycle_manager = DataLifecycleManager()

        self.flight_server = flight_server if flight_server else None

    def _is_step_done(self, step_uuids: Set[UUID], finished_ids: Set[UUID]) -> bool:
        """
        Checks if all steps identified by the given UUIDs have already been finished.
        """
        return all(uuid in finished_ids for uuid in step_uuids)

    def _drop_data_for_finished_cfws(self, finished_ids: Set[UUID]) -> None:
        """
        Handles the dropping of intermediate data based on finished steps.
        """
        self.data_lifecycle_manager.drop_data_for_finished_cfws(
            finished_ids, self.executor.cfw_collection, self.location
        )

    def compute(self) -> None:
        """
        Executes the mloda pipeline based on the execution plan.

        The plan is polled repeatedly until every UUID seen in it is finished. On
        each pass finished steps are skipped, running steps are checked for
        completion and runnable steps are started. Artifacts are stored and all
        workers are joined when the loop ends, also on error.

        Raises:
            ValueError: If ``__enter__`` has not set up the CfwManager yet.
            Exception: If the CfwManager reports an error from a step.
        """
        # The attribute is only annotated in __init__, so it may not exist at all yet.
        if getattr(self, "cfw_register", None) is None:
            raise ValueError("CfwManager not initialized")

        self.executor = ComputeFrameworkExecutor(self.cfw_register, self.worker_manager)

        finished_ids: Set[UUID] = set()
        to_finish_ids: Set[UUID] = set()
        currently_running_steps: Set[UUID] = set()

        try:
            while to_finish_ids != finished_ids or len(finished_ids) == 0:
                if self.cfw_register:
                    error = self.cfw_register.get_error()
                    if error:
                        logger.error(self.cfw_register.get_error_exc_info())
                        raise Exception(self.cfw_register.get_error_exc_info(), self.cfw_register.get_error_msg())
                else:
                    break

                for step in self.execution_planner:
                    to_finish_ids.update(step.get_uuids())

                    if isinstance(step, FeatureGroupStep):
                        self._drop_data_for_finished_cfws(finished_ids)

                    if self._is_step_done(step.get_uuids(), finished_ids):
                        continue

                    # check if step is currently running
                    if self.currently_running_step(step.get_uuids(), currently_running_steps):
                        if self._process_step_result(step):
                            self._mark_step_as_finished(step.get_uuids(), finished_ids, currently_running_steps)
                        continue

                    if not self._can_run_step(
                        step.required_uuids, step.get_uuids(), finished_ids, currently_running_steps
                    ):
                        continue
                    self._execute_step(step)

                time.sleep(0.01)

        finally:
            # Workers must be joined even if fetching the artifacts fails.
            try:
                self.data_lifecycle_manager.set_artifacts(self.cfw_register.get_artifacts())
            finally:
                self.join()

    def _process_step_result(self, step: Any) -> Union[Any, bool]:
        """
        Handles the result of a step based on its type.

        Returns False while the step is not done. For a finished feature group step
        the result is added to the result collection and data dropping is triggered.

        Raises:
            ValueError: If a finished feature group step has no ``any_uuid``.
        """
        # set step.is_done from other processes via result queue
        self.worker_manager.poll_result_queues()
        if step.uuid in self.worker_manager.result_uuids_collection:
            step.step_is_done = True

        if not step.step_is_done:
            return False

        if isinstance(step, (TransformFrameworkStep, JoinStep)):
            return True

        if isinstance(step, FeatureGroupStep):
            if step.features.any_uuid is None:
                raise ValueError(f"from_feature_uuid should not be none. {step}")

            cfw = self.executor.get_cfw(step.compute_framework, step.features.any_uuid)
            self.add_to_result_data_collection(cfw, step.features, step.uuid)
            self._drop_data_if_possible(cfw, step)

        return True

    def _drop_data_if_possible(self, cfw: ComputeFramework, step: Any) -> None:
        """
        Drops data associated with a compute framework if possible.

        Without a registered command queue for the CFW the drop check runs directly
        on the CFW. Otherwise the feature UUIDs are sent to the worker through its
        command queue and the call waits for the worker's completion signal.
        """
        _process, command_queue, result_queue = self.worker_manager.process_register.get(
            cfw.uuid, (None, None, None)
        )

        feature_uuids_to_possible_drop = {f.uuid for f in step.features.features}

        if command_queue is None:
            data_to_drop = cfw.add_already_calculated_children_and_drop_if_possible(
                feature_uuids_to_possible_drop, self.location
            )
            if isinstance(data_to_drop, frozenset):
                self.data_lifecycle_manager.track_flyway_datasets(cfw.uuid, set(data_to_drop))
        else:
            command_queue.put(feature_uuids_to_possible_drop)

            flyway_datasets = self.cfw_register.get_uuid_flyway_datasets(cfw.uuid)
            if flyway_datasets:
                self.data_lifecycle_manager.track_flyway_datasets(cfw.uuid, flyway_datasets)

        if result_queue is not None:
            self._wait_for_drop_completion(result_queue, cfw.uuid)

    def _wait_for_drop_completion(
        self, result_queue: multiprocessing.Queue[Any], cfw_uuid: UUID, timeout: float = 5.0
    ) -> None:
        """
        Wait for drop operation to complete from worker process.

        Args:
            result_queue: The queue to receive completion signals from the worker.
            cfw_uuid: The UUID of the compute framework being dropped.
            timeout: Maximum time to wait for completion in seconds.
        """
        self.worker_manager.wait_for_drop_completion(result_queue, cfw_uuid, timeout)

    def _execute_step(self, step: Any) -> None:
        """
        Executes a step based on its parallelization mode.
        """
        execution_function = self.executor._get_execution_function(
            self.cfw_register.get_parallelization_modes(), step.get_parallelization_mode()
        )
        execution_function(step)

    def join(self) -> None:
        """
        Joins all tasks by delegating to ``WorkerManager.join_all``.
        """
        self.worker_manager.join_all()

    def add_to_result_data_collection(self, cfw: ComputeFramework, features: FeatureSet, step_uuid: UUID) -> None:
        """
        Adds the result data to the result data collection.
        """
        self.data_lifecycle_manager.add_to_result_data_collection(cfw, features, step_uuid, self.location)

    def get_result_data(
        self, cfw: ComputeFramework, selected_feature_names: Set[FeatureName], location: Optional[str] = None
    ) -> Any:
        """
        Gets result data from the compute framework.
        """
        return self.data_lifecycle_manager.get_result_data(cfw, selected_feature_names, location)

    def currently_running_step(self, step_uuids: Set[UUID], currently_running_steps: Set[UUID]) -> bool:
        """
        Checks if a step is currently running.

        Only one UUID of the step is looked up, because ``_can_run_step`` and
        ``_mark_step_as_finished`` always add or remove all UUIDs of a step together.

        Returns:
            True if the step is currently running, False otherwise (also for an
            empty ``step_uuids``).
        """
        if not step_uuids:
            return False
        return next(iter(step_uuids)) in currently_running_steps

    def __enter__(
        self,
        parallelization_modes: Optional[Set[ParallelizationMode]] = None,
        function_extender: Optional[Set[Extender]] = None,
        api_data: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Enters the context of the ExecutionOrchestrator.

        Starts the manager, creates the CfwManager and, if a flight server was
        given, makes sure it is running and registers its location.

        Args:
            parallelization_modes: Modes handed to the CfwManager. Defaults to
                ``{ParallelizationMode.SYNC}`` when None.
            function_extender: Optional extenders handed to the CfwManager.
            api_data: Optional API data registered on the CfwManager.

        Raises:
            ValueError: If a flight server is set but reports no location.
        """
        # Build the default per call instead of sharing one mutable set between calls.
        if parallelization_modes is None:
            parallelization_modes = {ParallelizationMode.SYNC}

        MyManager.register("CfwManager", CfwManager)
        self.manager = MyManager().__enter__()
        self.cfw_register = self.manager.CfwManager(parallelization_modes, function_extender)  # type: ignore[attr-defined]

        if self.flight_server:
            if self.flight_server.flight_server_process is None:
                self.flight_server.start_flight_server_process()

            self.location = self.flight_server.get_location()

            if self.location is None:
                raise ValueError("Location should not be None.")

            self.cfw_register.set_location(self.location)

        if api_data:
            self.cfw_register.set_api_data(api_data)

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """
        Exits the context of the ExecutionOrchestrator.

        Shuts the manager down. Does nothing if ``__enter__`` never created one.

        Args:
            exc_type: The exception type.
            exc_val: The exception value.
            exc_tb: The exception traceback.
        """
        manager = getattr(self, "manager", None)
        if manager is not None:
            manager.shutdown()

    def get_artifacts(self) -> Dict[str, Any]:
        """
        Gets the artifacts.
        """
        return self.data_lifecycle_manager.get_artifacts()

    def _can_run_step(
        self,
        required_uuids: Set[UUID],
        step_uuid: Set[UUID],
        finished_steps: Set[UUID],
        currently_running_steps: Set[UUID],
    ) -> bool:
        """
        Checks if a step can be run. If it can, add it to the currently_running_steps set.

        A step can run when all required UUIDs are finished and none of its own
        UUIDs is already running.
        """
        with self._step_state_lock:
            if required_uuids.issubset(finished_steps) and not step_uuid.intersection(currently_running_steps):
                currently_running_steps.update(step_uuid)
                return True
            return False

    def _mark_step_as_finished(
        self, step_uuid: Set[UUID], finished_steps: Set[UUID], currently_running_steps: Set[UUID]
    ) -> None:
        """
        Marks a step as finished: removes it from the running set and adds it to the finished set.
        """
        with self._step_state_lock:
            currently_running_steps.difference_update(step_uuid)
            finished_steps.update(step_uuid)

    def get_result(self) -> List[Any]:
        """
        Gets the results.
        """
        return self.data_lifecycle_manager.get_results()
File without changes
@@ -8,11 +8,11 @@ from typing import Any, Set, Union
8
8
  from uuid import UUID
9
9
  from queue import Empty
10
10
 
11
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
12
- from mloda_core.core.cfw_manager import CfwManager
13
- from mloda_core.core.step.feature_group_step import FeatureGroupStep
14
- from mloda_core.core.step.join_step import JoinStep
15
- from mloda_core.core.step.transform_frame_work_step import TransformFrameworkStep
11
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
12
+ from mloda.core.core.cfw_manager import CfwManager
13
+ from mloda.core.core.step.feature_group_step import FeatureGroupStep
14
+ from mloda.core.core.step.join_step import JoinStep
15
+ from mloda.core.core.step.transform_frame_work_step import TransformFrameworkStep
16
16
 
17
17
 
18
18
  logger = logging.getLogger(__name__)
@@ -26,12 +26,17 @@ def _handle_stop_command(command_queue: multiprocessing.Queue[Any]) -> None:
26
26
 
27
27
  def _handle_data_dropping(
28
28
  command_queue: multiprocessing.Queue[Any],
29
- cfw: ComputeFrameWork,
29
+ cfw: ComputeFramework,
30
30
  command: Set[Any],
31
31
  location: str,
32
+ result_queue: multiprocessing.Queue[Any],
32
33
  ) -> bool:
33
34
  """Handles dropping already calculated data based on the provided command."""
34
35
  data_to_drop = cfw.add_already_calculated_children_and_drop_if_possible(command, location)
36
+
37
+ # Signal completion back to main thread
38
+ result_queue.put(("DROP_COMPLETE", cfw.uuid), block=False)
39
+
35
40
  if data_to_drop is True:
36
41
  _handle_stop_command(command_queue)
37
42
  return True
@@ -41,7 +46,7 @@ def _handle_data_dropping(
41
46
  def _execute_command(
42
47
  command: Union[JoinStep, TransformFrameworkStep, FeatureGroupStep],
43
48
  cfw_register: CfwManager,
44
- cfw: ComputeFrameWork,
49
+ cfw: ComputeFramework,
45
50
  data: Any,
46
51
  from_cfw: UUID,
47
52
  ) -> Any:
@@ -73,7 +78,7 @@ def _execute_command(
73
78
 
74
79
  def _handle_command_result(
75
80
  command: FeatureGroupStep,
76
- cfw: ComputeFrameWork,
81
+ cfw: ComputeFramework,
77
82
  location: str,
78
83
  data: Any,
79
84
  result_queue: multiprocessing.Queue[Any],
@@ -94,7 +99,7 @@ def worker(
94
99
  command_queue: multiprocessing.Queue[Any],
95
100
  result_queue: multiprocessing.Queue[Any],
96
101
  cfw_register: CfwManager,
97
- cfw: ComputeFrameWork,
102
+ cfw: ComputeFramework,
98
103
  from_cfw: UUID,
99
104
  ) -> None:
100
105
  data = None
@@ -115,7 +120,7 @@ def worker(
115
120
  break
116
121
 
117
122
  if isinstance(command, set):
118
- if _handle_data_dropping(command_queue, cfw, command, location):
123
+ if _handle_data_dropping(command_queue, cfw, command, location, result_queue):
119
124
  break
120
125
  continue
121
126
 
@@ -2,13 +2,13 @@ import logging
2
2
  import traceback
3
3
  from typing import Any
4
4
 
5
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
5
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
6
6
 
7
7
 
8
8
  logger = logging.getLogger(__name__)
9
9
 
10
10
 
11
- def thread_worker(command: Any, cfw_register: Any, cfw: ComputeFrameWork, from_cfw: ComputeFrameWork) -> None:
11
+ def thread_worker(command: Any, cfw_register: Any, cfw: ComputeFramework, from_cfw: ComputeFramework) -> None:
12
12
  try:
13
13
  command.execute(cfw_register, cfw, from_cfw=from_cfw)
14
14
  command.step_is_done = True
@@ -0,0 +1,96 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+ import queue
5
+ import threading
6
+ import time
7
+ import logging
8
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
9
+ from uuid import UUID
10
+
11
logger = logging.getLogger(__name__)


class WorkerManager:
    """Manages thread/process lifecycle for parallel execution.

    Threads are started and tracked so they can be joined later. Worker
    processes are created together with a command queue and a result queue,
    which are registered under the UUID passed by the caller.
    """

    # Tag of the tuple a worker puts on its result queue once a drop request has been handled.
    _DROP_COMPLETE = "DROP_COMPLETE"

    def __init__(self) -> None:
        """Initialize empty state."""
        # Every started thread or process, in start order; consumed by join_all().
        self.tasks: List[Union[threading.Thread, multiprocessing.Process]] = []
        # Maps a UUID to its (process, command_queue, result_queue) tuple.
        self.process_register: Dict[UUID, Tuple[Any, Any, Any]] = {}
        # Result queues of all created worker processes; scanned by poll_result_queues().
        self.result_queues_collection: Set[Any] = set()
        # UUIDs received from the result queues so far.
        self.result_uuids_collection: Set[UUID] = set()

    @classmethod
    def _is_drop_complete(cls, message: Any) -> bool:
        """Return True if ``message`` is a ``("DROP_COMPLETE", <uuid>)`` acknowledgement tuple."""
        return isinstance(message, tuple) and len(message) == 2 and message[0] == cls._DROP_COMPLETE

    def add_thread_task(self, task: threading.Thread) -> None:
        """Add task to list and call task.start()."""
        self.tasks.append(task)
        task.start()

    def create_worker_process(
        self, cfw_uuid: UUID, target: Callable[..., None], args: Tuple[Any, ...]
    ) -> Tuple[Any, Any, Any]:
        """Create and start a worker process with its own command and result queues.

        Args:
            cfw_uuid: Key under which the process and its queues are registered.
            target: Callable run in the new process. It is called with the command
                queue and the result queue as its first two positional arguments,
                followed by ``args``.
            args: Additional positional arguments for ``target``.

        Returns:
            The ``(process, command_queue, result_queue)`` tuple that was registered.
        """
        command_queue: multiprocessing.Queue[Any] = multiprocessing.Queue()
        result_queue: multiprocessing.Queue[Any] = multiprocessing.Queue()

        process = multiprocessing.Process(target=target, args=(command_queue, result_queue, *args))

        # Register everything before starting so the bookkeeping is complete
        # by the time the process can produce results.
        self.process_register[cfw_uuid] = (process, command_queue, result_queue)
        self.result_queues_collection.add(result_queue)
        self.tasks.append(process)
        process.start()

        return process, command_queue, result_queue

    def get_process_queues(self, cfw_uuid: UUID) -> Optional[Tuple[Any, Any, Any]]:
        """Return the registered ``(process, command_queue, result_queue)`` tuple or None."""
        return self.process_register.get(cfw_uuid)

    def send_command(self, cfw_uuid: UUID, command: Any) -> None:
        """Put ``command`` on the command queue registered for ``cfw_uuid``.

        Raises:
            ValueError: If no process is registered for ``cfw_uuid``.
        """
        registered = self.process_register.get(cfw_uuid)
        if registered is None:
            raise ValueError(f"No process found for CFW UUID: {cfw_uuid}")
        _, command_queue, _ = registered
        command_queue.put(command)

    def poll_result_queues(self) -> None:
        """Non-blocking poll of all result queues; add reported UUIDs to result_uuids_collection.

        At most one step result is consumed from each queue per call. A result may
        be a UUID string or a ``UUID`` instance.

        Drop acknowledgements (``("DROP_COMPLETE", <uuid>)`` tuples) share the
        result queue. One that arrives after its waiter has already timed out
        would otherwise reach ``UUID(...)`` and crash polling, so such stale
        acknowledgements are discarded here.
        """
        for r_queue in self.result_queues_collection:
            while True:
                try:
                    message = r_queue.get(block=False)
                except queue.Empty:
                    break

                if self._is_drop_complete(message):
                    logger.debug("Discarding stale drop acknowledgement: %r", message)
                    continue

                result_uuid = message if isinstance(message, UUID) else UUID(message)
                self.result_uuids_collection.add(result_uuid)
                break

    def is_step_done(self, step_uuid: UUID) -> bool:
        """Return whether ``step_uuid`` has been received from a result queue."""
        return step_uuid in self.result_uuids_collection

    def wait_for_drop_completion(self, result_queue: Any, cfw_uuid: UUID, timeout: float = 5.0) -> None:
        """Poll ``result_queue`` until ``("DROP_COMPLETE", cfw_uuid)`` is received or ``timeout`` elapses.

        Any other message read while waiting is put back on the queue before this
        method returns, so it stays available to later consumers. A timeout is
        logged as a warning; it does not raise.

        Args:
            result_queue: Queue to read from.
            cfw_uuid: UUID expected in the acknowledgement.
            timeout: Maximum time to wait, in seconds.
        """
        # A monotonic clock keeps the timeout unaffected by wall-clock adjustments.
        deadline = time.monotonic() + timeout

        # Unrelated messages are held back and re-queued once at the end. Putting
        # each one back immediately would let this loop dequeue and re-queue the
        # same message over and over until the timeout.
        held_back: List[Any] = []
        try:
            while time.monotonic() < deadline:
                try:
                    msg = result_queue.get(block=False)
                except queue.Empty:
                    time.sleep(0.001)
                    continue

                if self._is_drop_complete(msg) and msg[1] == cfw_uuid:
                    return
                held_back.append(msg)

            logger.warning("Drop operation for CFW %s timed out after %ss", cfw_uuid, timeout)
        finally:
            for msg in held_back:
                result_queue.put(msg, block=False)

    def join_all(self) -> None:
        """Terminate processes (not threads), join all tasks, raise Exception if any fail.

        Every task is attempted even if an earlier one fails. Failures are logged
        and the first one is chained as the cause of the raised exception.

        Raises:
            Exception: If terminating or joining any task raised.
        """
        first_error: Optional[Exception] = None
        for task in self.tasks:
            try:
                if isinstance(task, multiprocessing.Process):
                    task.terminate()
                task.join()
            except Exception as e:
                logger.error("Error joining task: %s", e)
                if first_error is None:
                    first_error = e

        if first_error is not None:
            raise Exception("Error while joining tasks") from first_error