mloda 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. mloda/__init__.py +17 -0
  2. {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
  3. {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
  4. {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
  5. {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
  6. {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
  7. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
  8. mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
  9. {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
  10. {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
  11. {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
  12. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
  13. {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
  14. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
  15. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
  16. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
  17. {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
  18. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
  19. mloda/core/abstract_plugins/components/link.py +437 -0
  20. {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
  21. {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
  22. {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
  23. {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
  24. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
  25. mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
  26. mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
  27. mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
  28. mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
  29. mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
  30. mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
  31. mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
  32. mloda/core/abstract_plugins/function_extender.py +78 -0
  33. mloda/core/api/plugin_docs.py +220 -0
  34. mloda/core/api/plugin_info.py +32 -0
  35. {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
  36. {mloda_core → mloda/core}/api/request.py +42 -33
  37. {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
  38. {mloda_core → mloda/core}/core/engine.py +47 -46
  39. {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
  40. {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
  41. {mloda_core → mloda/core}/core/step/join_step.py +14 -14
  42. {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
  43. {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
  44. {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
  45. {mloda_core → mloda/core}/filter/global_filter.py +23 -23
  46. {mloda_core → mloda/core}/filter/single_filter.py +6 -6
  47. {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
  48. {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
  49. {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
  50. {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
  51. {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
  52. {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
  53. {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
  54. {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
  55. {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
  56. {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
  57. mloda/core/prepare/validators/resolve_link_validator.py +32 -0
  58. mloda/core/runtime/compute_framework_executor.py +271 -0
  59. mloda/core/runtime/data_lifecycle_manager.py +160 -0
  60. mloda/core/runtime/flight/__init__.py +0 -0
  61. {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
  62. mloda/core/runtime/run.py +317 -0
  63. mloda/core/runtime/worker/__init__.py +0 -0
  64. {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
  65. {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
  66. mloda/core/runtime/worker_manager.py +96 -0
  67. mloda/provider/__init__.py +101 -0
  68. mloda/steward/__init__.py +25 -0
  69. mloda/user/__init__.py +57 -0
  70. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
  71. mloda-0.4.0.dist-info/RECORD +248 -0
  72. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
  73. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
  74. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
  75. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
  76. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
  77. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
  78. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
  79. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
  80. mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
  81. mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
  82. mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
  83. mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
  84. mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
  85. mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
  86. mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
  87. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
  88. mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
  89. mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
  90. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
  91. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
  92. mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
  93. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
  94. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
  95. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
  96. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
  97. mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
  98. mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
  99. mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
  100. mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
  101. mloda_plugins/config/feature/loader.py +2 -2
  102. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
  103. mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
  104. mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
  105. mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
  106. mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
  107. mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
  108. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
  109. mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
  110. mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
  111. mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
  112. mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
  113. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
  114. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
  115. mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
  116. mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
  117. mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
  118. mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
  119. mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
  120. mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
  121. mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
  122. mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
  123. mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
  124. mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
  125. mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
  126. mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
  127. mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
  128. mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
  129. mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
  130. mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
  131. mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
  132. mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
  133. mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
  134. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
  135. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
  136. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
  137. mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
  138. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
  139. mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
  140. mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
  141. mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
  142. mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
  143. mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
  144. mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
  145. mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
  146. mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
  147. mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
  148. mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
  149. mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
  150. mloda_plugins/feature_group/input_data/read_db.py +7 -9
  151. mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
  152. mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
  153. mloda_plugins/feature_group/input_data/read_file.py +8 -8
  154. mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
  155. mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
  156. mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
  157. mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
  158. mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
  159. mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
  160. mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
  161. mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
  162. mloda-0.3.3.dist-info/RECORD +0 -230
  163. mloda_core/abstract_plugins/components/link.py +0 -286
  164. mloda_core/abstract_plugins/function_extender.py +0 -34
  165. mloda_core/runtime/run.py +0 -617
  166. {mloda_core → mloda/core}/__init__.py +0 -0
  167. {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
  168. {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
  169. {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
  170. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
  171. {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
  172. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
  173. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
  174. {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
  175. {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
  176. {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
  177. {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
  178. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
  179. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
  180. {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
  181. {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
  182. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
  183. {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
  184. {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
  185. {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
  186. {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
  187. {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
  188. {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
  189. {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
  190. {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
  191. {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
  192. {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
  193. {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
  194. {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
  195. {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
  196. {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
  197. {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
  198. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
  199. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
  200. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
  201. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
mloda_core/runtime/run.py DELETED
@@ -1,617 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from collections import defaultdict
4
- import multiprocessing
5
- import queue
6
- import threading
7
- import time
8
- import traceback
9
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
10
- from uuid import UUID, uuid4
11
- import logging
12
-
13
- from mloda_core.abstract_plugins.components.framework_transformer.cfw_transformer import ComputeFrameworkTransformer
14
- from mloda_core.abstract_plugins.function_extender import WrapperFunctionExtender
15
- from mloda_core.abstract_plugins.components.feature_name import FeatureName
16
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
17
- from mloda_core.prepare.execution_plan import ExecutionPlan
18
- from mloda_core.runtime.worker.multiprocessing_worker import worker
19
- from mloda_core.runtime.worker.thread_worker import thread_worker
20
- from mloda_core.core.cfw_manager import CfwManager, MyManager
21
- from mloda_core.abstract_plugins.components.parallelization_modes import ParallelizationModes
22
- from mloda_core.runtime.flight.runner_flight_server import ParallelRunnerFlightServer
23
- from mloda_core.core.step.feature_group_step import FeatureGroupStep
24
- from mloda_core.core.step.join_step import JoinStep
25
- from mloda_core.core.step.transform_frame_work_step import TransformFrameworkStep
26
- from mloda_core.abstract_plugins.components.feature_set import FeatureSet
27
- from mloda_core.runtime.flight.flight_server import FlightServer
28
-
29
-
30
- logger = logging.getLogger(__name__)
31
-
32
-
33
- class Runner:
34
- """
35
- Orchestrates the execution of an mloda based on a given execution plan.
36
-
37
- This class manages compute frameworks (CFWs), data dependencies, and parallel execution
38
- using threads or multiprocessing. It handles the execution of feature group steps,
39
- transform framework steps, and join steps, while also managing data dropping and result collection.
40
- """
41
-
42
- def __init__(
43
- self,
44
- execution_planner: ExecutionPlan,
45
- flight_server: Optional[ParallelRunnerFlightServer] = None,
46
- ) -> None:
47
- """
48
- Initializes the Runner with an execution plan and optional flight server.
49
-
50
- Args:
51
- execution_planner: The execution plan that defines the steps to be executed.
52
- flight_server: An optional flight server for data transfer.
53
- """
54
- self.execution_planner = execution_planner
55
-
56
- self.cfw_register: CfwManager
57
- self.result_data_collection: Dict[UUID, Any] = {}
58
- self.track_data_to_drop: Dict[UUID, Set[UUID]] = {}
59
- self.artifacts: Dict[str, Any] = {}
60
-
61
- # multiprocessing
62
- self.location: Optional[str] = None
63
- self.tasks: List[Union[threading.Thread, multiprocessing.Process]] = []
64
- self.process_register: Dict[
65
- UUID, Tuple[multiprocessing.Process, multiprocessing.Queue[Any], multiprocessing.Queue[Any]]
66
- ] = defaultdict()
67
- self.result_queues_collection: Set[multiprocessing.Queue[Any]] = set()
68
- self.result_uuids_collection: Set[UUID] = set()
69
-
70
- # Initialize framework transformer
71
- self.transformer = ComputeFrameworkTransformer()
72
-
73
- self.flight_server = None
74
- if flight_server:
75
- self.flight_server = flight_server
76
-
77
- # This can be reduced in realtime.
78
- # It is set currently for convenience on this high level
79
- self.wait_for_drop_data = 0.01
80
-
81
- def _is_step_done(self, step_uuids: Set[UUID], finished_ids: Set[UUID]) -> bool:
82
- """
83
- Checks if all steps identified by the given UUIDs have already been finished.
84
- """
85
- return all(uuid in finished_ids for uuid in step_uuids)
86
-
87
- def _drop_data_for_finished_cfws(self, finished_ids: Set[UUID]) -> None:
88
- """
89
- Handles the dropping of intermediate data based on finished steps.
90
- """
91
- if not finished_ids:
92
- return
93
-
94
- cfw_to_delete = set()
95
- for cfw_uuid, step_uuids in self.track_data_to_drop.items():
96
- if all(step_id in finished_ids for step_id in step_uuids):
97
- self._drop_cfw_data(cfw_uuid)
98
- cfw_to_delete.add(cfw_uuid)
99
-
100
- for cfw_uuid in cfw_to_delete:
101
- del self.track_data_to_drop[cfw_uuid]
102
-
103
- def _drop_cfw_data(self, cfw_uuid: UUID) -> None:
104
- """Drops data associated with a CFW."""
105
- if self.location:
106
- # FlightServer.drop_tables(self.location, {str(self.cfw_collection[cfw_uuid].uuid)})
107
- pass
108
- else:
109
- self.cfw_collection[cfw_uuid].drop_last_data()
110
-
111
- def compute(self) -> None:
112
- """
113
- Executes the mloda pipeline based on the execution plan.
114
-
115
- This method iterates through the execution plan, checks dependencies,
116
- and executes steps using the appropriate parallelization mode.
117
- It also handles errors, result collection, and data dropping.
118
- """
119
- if self.cfw_register is None:
120
- raise ValueError("CfwManager not initialized")
121
-
122
- finished_ids: Set[UUID] = set()
123
- to_finish_ids: Set[UUID] = set()
124
- currently_running_steps: Set[UUID] = set()
125
-
126
- self.cfw_collection: Dict[UUID, ComputeFrameWork] = {}
127
-
128
- try:
129
- while to_finish_ids != finished_ids or len(finished_ids) == 0:
130
- if self.cfw_register:
131
- error = self.cfw_register.get_error()
132
- if error:
133
- logger.error(self.cfw_register.get_error_exc_info())
134
- raise Exception(self.cfw_register.get_error_exc_info(), self.cfw_register.get_error_msg())
135
- else:
136
- break
137
-
138
- for step in self.execution_planner:
139
- to_finish_ids.update(step.get_uuids())
140
-
141
- if isinstance(step, FeatureGroupStep):
142
- self._drop_data_for_finished_cfws(finished_ids)
143
-
144
- if self._is_step_done(step.get_uuids(), finished_ids):
145
- continue
146
-
147
- # check if step is currently running
148
- if self.currently_running_step(step.get_uuids(), currently_running_steps):
149
- if self._process_step_result(step):
150
- self._mark_step_as_finished(step.get_uuids(), finished_ids, currently_running_steps)
151
- continue
152
-
153
- if not self._can_run_step(
154
- step.required_uuids, step.get_uuids(), finished_ids, currently_running_steps
155
- ):
156
- continue
157
- self._execute_step(step)
158
-
159
- time.sleep(0.01)
160
-
161
- finally:
162
- self.artifacts = self.cfw_register.get_artifacts()
163
- self.join()
164
-
165
- def get_done_steps_of_multiprocessing_result_queue(self) -> None:
166
- """
167
- Retrieves UUIDs of finished steps from multiprocessing result queues.
168
-
169
- This method iterates through the result queues and adds any available UUIDs
170
- to the collection of finished UUIDs.
171
- """
172
- for r_queue in self.result_queues_collection:
173
- try:
174
- result_uuid = r_queue.get(block=False)
175
- self.result_uuids_collection.add(UUID(result_uuid))
176
- except queue.Empty:
177
- continue
178
-
179
- def _process_step_result(self, step: Any) -> Union[Any, bool]:
180
- """
181
- Handles the result of a step based on its type.
182
-
183
- This method checks if a step is done, then performs specific actions based
184
- on the step's type, such as adding results to the data collection or dropping data.
185
- """
186
- # set step.is_done from other processes via result queue
187
- self.get_done_steps_of_multiprocessing_result_queue()
188
- if step.uuid in self.result_uuids_collection:
189
- step.step_is_done = True
190
-
191
- if not step.step_is_done:
192
- return False
193
-
194
- if isinstance(step, (TransformFrameworkStep, JoinStep)):
195
- return True
196
-
197
- if isinstance(step, FeatureGroupStep):
198
- if step.features.any_uuid is None:
199
- raise ValueError(f"from_feature_uuid should not be none. {step}")
200
-
201
- cfw = self.get_cfw(step.compute_framework, step.features.any_uuid)
202
- self.add_to_result_data_collection(cfw, step.features, step.uuid)
203
- self._drop_data_if_possible(cfw, step)
204
-
205
- return True
206
-
207
- def _drop_data_if_possible(self, cfw: ComputeFrameWork, step: Any) -> None:
208
- """
209
- Drops data associated with a compute framework if possible.
210
-
211
- This method checks if data can be dropped based on the CFW's dependencies
212
- and either drops the data directly or sends a command to a worker process to do so.
213
- """
214
- process, command_queue, result_queue = self.process_register.get(cfw.uuid, (None, None, None))
215
-
216
- feature_uuids_to_possible_drop = {f.uuid for f in step.features.features}
217
-
218
- if command_queue is None:
219
- data_to_drop = cfw.add_already_calculated_children_and_drop_if_possible(
220
- feature_uuids_to_possible_drop, self.location
221
- )
222
- if isinstance(data_to_drop, frozenset):
223
- self.track_data_to_drop[cfw.uuid] = set(data_to_drop)
224
- else:
225
- command_queue.put(feature_uuids_to_possible_drop)
226
-
227
- flyway_datasets = self.cfw_register.get_uuid_flyway_datasets(cfw.uuid)
228
- if flyway_datasets:
229
- self.track_data_to_drop[cfw.uuid] = flyway_datasets
230
-
231
- time.sleep(self.wait_for_drop_data)
232
-
233
- def get_cfw(self, compute_framework: Type[ComputeFrameWork], feature_uuid: UUID) -> ComputeFrameWork:
234
- """
235
- Retrieves a compute framework based on its type and a feature UUID.
236
-
237
- Args:
238
- compute_framework: The type of compute framework to retrieve.
239
- feature_uuid: The UUID of the feature associated with the compute framework.
240
- """
241
- cfw_uuid = self.cfw_register.get_initialized_compute_framework_uuid(
242
- compute_framework, feature_uuid=feature_uuid
243
- )
244
- if cfw_uuid is None:
245
- raise ValueError(f"cfw_uuid should not be none: {compute_framework}.")
246
- return self.cfw_collection[cfw_uuid]
247
-
248
- def prepare_execute_step(self, step: Any, parallelization_mode: ParallelizationModes) -> UUID:
249
- """
250
- Prepares a step for execution by initializing or retrieving the associated CFW.
251
- """
252
- cfw_uuid: Optional[UUID] = None
253
-
254
- if isinstance(step, FeatureGroupStep):
255
- for tfs_id in step.tfs_ids:
256
- cfw_uuid = self.cfw_register.get_cfw_uuid(step.compute_framework.get_class_name(), tfs_id)
257
- if cfw_uuid:
258
- return cfw_uuid
259
-
260
- feature_uuid = step.features.any_uuid
261
-
262
- if feature_uuid is None:
263
- raise ValueError(f"from_feature_uuid should not be none. {step, feature_uuid}")
264
-
265
- cfw_uuid = self.add_compute_framework(step, parallelization_mode, feature_uuid, set(step.children_if_root))
266
- elif isinstance(step, TransformFrameworkStep):
267
- from_feature_uuid, from_cfw_uuid = None, None
268
- for r_f in step.required_uuids:
269
- from_cfw_uuid = self.cfw_register.get_cfw_uuid(step.from_framework.get_class_name(), r_f)
270
- if from_cfw_uuid:
271
- from_feature_uuid = r_f
272
- break
273
-
274
- if from_feature_uuid is None or from_cfw_uuid is None:
275
- raise ValueError(
276
- f"from_feature_uuid or from_cfw_uuid should not be none. {step, from_feature_uuid, from_cfw_uuid}"
277
- )
278
-
279
- from_cfw = self.cfw_collection[from_cfw_uuid]
280
- childrens = set(from_cfw.children_if_root)
281
-
282
- if step.link_id:
283
- from_feature_uuid = step.link_id
284
- childrens.add(from_feature_uuid)
285
-
286
- with multiprocessing.Lock():
287
- cfw_uuid = self.init_compute_framework(step.to_framework, parallelization_mode, childrens, step.uuid)
288
-
289
- elif isinstance(step, JoinStep):
290
- cfw_uuid = self.cfw_register.get_cfw_uuid(
291
- step.left_framework.get_class_name(), next(iter(step.left_framework_uuids))
292
- )
293
-
294
- if cfw_uuid is None:
295
- raise ValueError(f"This should not occur. {step}")
296
-
297
- return cfw_uuid
298
-
299
- def prepare_tfs_right_cfw(self, step: TransformFrameworkStep) -> UUID:
300
- """
301
- Prepares the right CFW for a TransformFrameworkStep.
302
- """
303
- uuid = step.right_framework_uuid if step.right_framework_uuid else next(iter(step.required_uuids))
304
-
305
- cfw_uuid = self.cfw_register.get_cfw_uuid(step.from_framework.get_class_name(), uuid)
306
-
307
- if cfw_uuid is None or isinstance(cfw_uuid, UUID) is False:
308
- raise ValueError(
309
- f"cfw_uuid should not be none in prepare_tfs: {step.from_framework.get_class_name()}, {uuid}"
310
- )
311
-
312
- return cfw_uuid
313
-
314
- def prepare_tfs_and_joinstep(self, step: Any) -> Any:
315
- """
316
- Prepares CFWs required for TransformFrameworkStep or JoinStep.
317
- """
318
- from_cfw: Optional[Union[ComputeFrameWork, UUID]] = None
319
- if isinstance(step, TransformFrameworkStep):
320
- from_cfw = self.prepare_tfs_right_cfw(step)
321
- from_cfw = self.cfw_collection[from_cfw]
322
- elif isinstance(step, JoinStep):
323
- # Left framework here, because it is already transformed beforehand
324
- from_cfw_uuid = self.cfw_register.get_cfw_uuid(step.left_framework.get_class_name(), step.link.uuid)
325
-
326
- if from_cfw_uuid is None:
327
- from_cfw_uuid = self.cfw_register.get_cfw_uuid(
328
- step.left_framework.get_class_name(), next(iter(step.right_framework_uuids))
329
- )
330
-
331
- if from_cfw_uuid is None:
332
- raise ValueError(
333
- f"from_cfw_uuid should not be none: {step.left_framework.get_class_name()}, {step.link.uuid}"
334
- )
335
-
336
- from_cfw = self.cfw_collection[from_cfw_uuid]
337
- return from_cfw
338
-
339
- def _execute_step(self, step: Any) -> None:
340
- """
341
- Executes a step based on its parallelization mode.
342
- """
343
- execution_function = self._get_execution_function(
344
- self.cfw_register.get_parallelization_modes(), step.get_parallelization_mode()
345
- )
346
- execution_function(step)
347
-
348
- def sync_execute_step(self, step: Any) -> None:
349
- """
350
- Executes a step synchronously.
351
- """
352
- cfw_uuid = self.prepare_execute_step(step, ParallelizationModes.SYNC)
353
-
354
- try:
355
- from_cfw = self.prepare_tfs_and_joinstep(step) or None
356
- step.execute(self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw=from_cfw)
357
- step.step_is_done = True
358
-
359
- except Exception as e:
360
- error_message = f"An error occurred: {e}"
361
- msg = f"{error_message}\nFull traceback:\n{traceback.format_exc()}"
362
- logging.error(msg)
363
- exc_info = traceback.format_exc()
364
- self.cfw_register.set_error(msg, exc_info)
365
-
366
- def thread_execute_step(self, step: Any) -> None:
367
- """
368
- Executes a step in a separate thread.
369
- """
370
- cfw_uuid = self.prepare_execute_step(step, ParallelizationModes.THREADING)
371
- from_cfw = self.prepare_tfs_and_joinstep(step) or None
372
-
373
- task = threading.Thread(
374
- target=thread_worker,
375
- args=(step, self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw),
376
- )
377
-
378
- self.tasks.append(task)
379
- task.start()
380
-
381
def multi_execute_step(self, step: Any) -> None:
    """
    Executes a step in a separate process.

    One worker process is kept per compute framework UUID in ``self.process_register``
    together with its command and result queues. On a register miss the queues and the
    process are created, the process is started and tracked in ``self.tasks``, and its
    result queue is added to ``self.result_queues_collection``. The step is then sent
    to the worker through the command queue.

    Raises:
        ValueError: If no command queue is available for the worker.
    """
    cfw_uuid = self.prepare_execute_step(step, ParallelizationModes.MULTIPROCESSING)

    from_cfw = None
    if isinstance(step, TransformFrameworkStep):
        from_cfw = self.prepare_tfs_right_cfw(step)

    # Look the entry up first and only build queues on a miss. Passing freshly
    # constructed queues as the ``dict.get`` default would allocate two throwaway
    # multiprocessing.Queue objects on every call, even for registered workers.
    entry = self.process_register.get(cfw_uuid)
    if entry is None:
        entry = (None, multiprocessing.Queue(), multiprocessing.Queue())
    process, command_queue, result_queue = entry

    if process is None:
        process = multiprocessing.Process(
            target=worker,
            args=(command_queue, result_queue, self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw),
        )
        self.process_register[cfw_uuid] = (process, command_queue, result_queue)
        process.start()
        self.tasks.append(process)
        self.result_queues_collection.add(result_queue)

    if command_queue:
        command_queue.put(step)
    else:
        raise ValueError("Command queue should not be None.")
409
-
410
def join(self) -> None:
    """
    Waits for every tracked task; multiprocessing processes are terminated first.

    Errors raised while terminating or joining a task are logged and do not stop the
    loop, so the remaining tasks are still handled.

    Raises:
        Exception: If at least one task could not be terminated or joined.
    """
    error_count = 0
    for task in self.tasks:
        try:
            # Processes are terminated before being joined; threads are only joined.
            if isinstance(task, multiprocessing.Process):
                task.terminate()
            task.join()
        except Exception as e:
            logger.error(f"Error joining task: {e}")
            error_count += 1

    if error_count:
        raise Exception("Error while joining tasks")
427
-
428
def add_to_result_data_collection(self, cfw: ComputeFrameWork, features: FeatureSet, step_uuid: UUID) -> None:
    """
    Stores the result of a step under its UUID in ``self.result_data_collection``.

    Nothing is stored when the feature set reports no initially requested features,
    or when ``get_result_data`` returns None.
    """
    requested = features.get_initial_requested_features()
    if not requested:
        return

    result = self.get_result_data(cfw, requested, self.location)
    if result is not None:
        self.result_data_collection[step_uuid] = result
437
-
438
def get_result_data(
    self, cfw: ComputeFrameWork, selected_feature_names: Set[FeatureName], location: Optional[str] = None
) -> Any:
    """
    Returns the selected feature columns from the compute framework's data.

    The data held on ``cfw.data`` is used when present. Otherwise, if a location is
    given, the table is downloaded via ``FlightServer.download_table`` using
    ``str(cfw.uuid)`` and converted back with ``cfw.convert_flyserver_data_back``.

    Raises:
        ValueError: If the framework holds no data and no location is given.
    """
    data = cfw.data
    if data is None:
        if not location:
            raise ValueError("Not implemented.")
        data = FlightServer.download_table(location, str(cfw.uuid))
        data = cfw.convert_flyserver_data_back(data, self.transformer)

    return cfw.select_data_by_column_names(data, selected_feature_names)
453
-
454
def add_compute_framework(
    self,
    step: Any,
    parallelization_mode: ParallelizationModes,
    feature_uuid: UUID,
    children_if_root: Set[UUID],
) -> UUID:
    """
    Looks up the compute framework for a step and creates it when it is missing.

    The CFW register is queried with the class name of ``step.compute_framework`` and
    the feature UUID. If no UUID is registered, a new framework is initialised via
    ``init_compute_framework``.

    Returns:
        The UUID of the existing or newly created compute framework.
    """
    # NOTE(review): this lock object is created anew on every call, so it does not
    # serialise concurrent callers - confirm whether a shared lock was intended.
    with multiprocessing.Lock():
        cfw_class = step.compute_framework
        existing_uuid = self.cfw_register.get_cfw_uuid(cfw_class.get_class_name(), feature_uuid)
        if existing_uuid is not None:
            return existing_uuid

        # No framework registered yet for this class/feature: create one.
        return self.init_compute_framework(step.compute_framework, parallelization_mode, children_if_root)
474
-
475
def init_compute_framework(
    self,
    cf_class: Type[ComputeFrameWork],
    parallelization_mode: ParallelizationModes,
    children_if_root: Set[UUID],
    uuid: Optional[UUID] = None,
) -> UUID:
    """
    Creates a compute framework instance and records it.

    The instance is built from ``cf_class`` with the given parallelization mode, a
    frozen copy of ``children_if_root``, the given UUID (a fresh ``uuid4()`` when none
    is passed) and the function extender obtained from the CFW register. It is then
    added to the CFW register and stored in ``self.cfw_collection`` under its UUID.

    Returns:
        The UUID of the new compute framework.
    """
    extender = self.cfw_register.get_function_extender()

    framework = cf_class(
        parallelization_mode,
        frozenset(children_if_root),
        uuid if uuid else uuid4(),
        function_extender=extender,
    )

    # Record the framework in the register first, then keep the instance locally.
    self.cfw_register.add_cfw_to_compute_frameworks(
        framework.get_uuid(), cf_class.get_class_name(), children_if_root
    )
    self.cfw_collection[framework.get_uuid()] = framework

    return framework.get_uuid()
506
-
507
def currently_running_step(self, step_uuids: Set[UUID], currently_running_steps: Set[UUID]) -> bool:
    """
    Reports whether a step is among the currently running steps.

    Only one UUID taken from ``step_uuids`` (the first one yielded by iteration) is
    looked up in ``currently_running_steps``.

    Returns:
        True if that UUID is currently running, False otherwise.
    """
    probe_uuid = next(iter(step_uuids))
    return probe_uuid in currently_running_steps
517
-
518
def __enter__(
    self,
    parallelization_modes: Set[ParallelizationModes] = {ParallelizationModes.SYNC},
    function_extender: Optional[Set[WrapperFunctionExtender]] = None,
    api_data: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Sets up the manager and the CFW register for a run.

    ``CfwManager`` is registered on ``MyManager``, a manager is entered and a
    ``CfwManager`` is created through it from the given parallelization modes and
    function extender. When a flight server is configured, its process is started if
    it is not running yet and its location is passed to the CFW register. API data,
    if given, is handed to the CFW register as well.

    Nothing is returned, so ``with ... as name`` binds ``name`` to None.

    Raises:
        ValueError: If a flight server is configured but provides no location.
    """
    MyManager.register("CfwManager", CfwManager)
    self.manager = MyManager().__enter__()
    self.cfw_register = self.manager.CfwManager(parallelization_modes, function_extender)  # type: ignore[attr-defined]

    if self.flight_server:
        # Start the server process lazily, then publish where it can be reached.
        if self.flight_server.flight_server_process is None:
            self.flight_server.start_flight_server_process()

        self.location = self.flight_server.get_location()
        if self.location is None:
            raise ValueError("Location should not be None.")
        self.cfw_register.set_location(self.location)

    if api_data:
        self.cfw_register.set_api_data(api_data)
545
-
546
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """
    Leaves the Runner context by shutting down the manager.

    The exception arguments are not inspected and nothing is returned, so an
    exception raised inside the ``with`` block is not suppressed.

    Args:
        exc_type: The exception type.
        exc_val: The exception value.
        exc_tb: The exception traceback.
    """
    manager = self.manager
    manager.shutdown()
556
-
557
def get_artifacts(self) -> Dict[str, Any]:
    """
    Returns the artifacts held by this runner.

    The stored ``self.artifacts`` object itself is returned, not a copy.
    """
    artifacts = self.artifacts
    return artifacts
562
-
563
def _can_run_step(
    self,
    required_uuids: Set[UUID],
    step_uuid: Set[UUID],
    finished_steps: Set[UUID],
    currently_running_steps: Set[UUID],
) -> bool:
    """
    Decides whether a step may start and, if so, marks it as running.

    A step may start when all ``required_uuids`` are in ``finished_steps`` and none of
    its own UUIDs is in ``currently_running_steps``. In that case ``step_uuid`` is
    added to ``currently_running_steps`` in place.

    Returns:
        True if the step was marked as running, False otherwise.
    """
    # NOTE(review): the lock is created per call, so it does not serialise concurrent
    # callers - confirm whether a shared lock was intended.
    with threading.Lock():
        if not required_uuids.issubset(finished_steps):
            return False
        if step_uuid.intersection(currently_running_steps):
            return False
        currently_running_steps.update(step_uuid)
        return True
579
-
580
def _mark_step_as_finished(
    self, step_uuid: Set[UUID], finished_steps: Set[UUID], currently_running_steps: Set[UUID]
) -> None:
    """
    Moves a step from the running set to the finished set.

    Both sets are modified in place: the step's UUIDs are removed from
    ``currently_running_steps`` and added to ``finished_steps``.
    """
    # NOTE(review): the lock is created per call, so it does not serialise concurrent
    # callers - confirm whether a shared lock was intended.
    with threading.Lock():
        currently_running_steps -= step_uuid
        finished_steps |= step_uuid
589
-
590
def _get_execution_function(
    self, mode_by_cfw_register: Set[ParallelizationModes], mode_by_step: Set[ParallelizationModes]
) -> Callable[[Any], None]:
    """
    Picks the execute-step method for the modes shared by the register and the step.

    Multiprocessing takes precedence over threading; synchronous execution is the
    fallback when neither is contained in both sets.

    Returns:
        The bound execute-step method for the selected mode.
    """
    shared_modes = mode_by_cfw_register.intersection(mode_by_step)

    if ParallelizationModes.MULTIPROCESSING in shared_modes:
        return self.multi_execute_step
    if ParallelizationModes.THREADING in shared_modes:
        return self.thread_execute_step
    return self.sync_execute_step
606
-
607
def get_result(self) -> List[Any]:
    """
    Returns all collected results as a list, in the collection's iteration order.

    Raises:
        ValueError: If no results have been collected.
    """
    # TODO: This is a temporary solution. We need to return the data in a more structured way.
    # Idea: return a dictionary with the feature name as key and the data as value.
    # Idea: the list could keep a history for debugging.
    results = list(self.result_data_collection.values())
    if not results:
        raise ValueError("No results found.")
    return results
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes