mloda-0.3.3-py3-none-any.whl → mloda-0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. mloda/__init__.py +17 -0
  2. {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
  3. {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
  4. {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
  5. {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
  6. {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
  7. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
  8. mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
  9. {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
  10. {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
  11. {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
  12. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
  13. {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
  14. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
  15. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
  16. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
  17. {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
  18. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
  19. mloda/core/abstract_plugins/components/link.py +437 -0
  20. {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
  21. {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
  22. {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
  23. {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
  24. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
  25. mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
  26. mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
  27. mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
  28. mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
  29. mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
  30. mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
  31. mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
  32. mloda/core/abstract_plugins/function_extender.py +78 -0
  33. mloda/core/api/plugin_docs.py +220 -0
  34. mloda/core/api/plugin_info.py +32 -0
  35. {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
  36. {mloda_core → mloda/core}/api/request.py +42 -33
  37. {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
  38. {mloda_core → mloda/core}/core/engine.py +47 -46
  39. {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
  40. {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
  41. {mloda_core → mloda/core}/core/step/join_step.py +14 -14
  42. {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
  43. {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
  44. {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
  45. {mloda_core → mloda/core}/filter/global_filter.py +23 -23
  46. {mloda_core → mloda/core}/filter/single_filter.py +6 -6
  47. {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
  48. {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
  49. {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
  50. {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
  51. {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
  52. {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
  53. {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
  54. {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
  55. {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
  56. {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
  57. mloda/core/prepare/validators/resolve_link_validator.py +32 -0
  58. mloda/core/runtime/compute_framework_executor.py +271 -0
  59. mloda/core/runtime/data_lifecycle_manager.py +160 -0
  60. mloda/core/runtime/flight/__init__.py +0 -0
  61. {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
  62. mloda/core/runtime/run.py +317 -0
  63. mloda/core/runtime/worker/__init__.py +0 -0
  64. {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
  65. {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
  66. mloda/core/runtime/worker_manager.py +96 -0
  67. mloda/provider/__init__.py +101 -0
  68. mloda/steward/__init__.py +25 -0
  69. mloda/user/__init__.py +57 -0
  70. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
  71. mloda-0.4.0.dist-info/RECORD +248 -0
  72. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
  73. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
  74. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
  75. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
  76. mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
  77. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
  78. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
  79. mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
  80. mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
  81. mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
  82. mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
  83. mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
  84. mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
  85. mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
  86. mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
  87. mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
  88. mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
  89. mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
  90. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
  91. mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
  92. mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
  93. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
  94. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
  95. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
  96. mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
  97. mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
  98. mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
  99. mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
  100. mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
  101. mloda_plugins/config/feature/loader.py +2 -2
  102. mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
  103. mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
  104. mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
  105. mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
  106. mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
  107. mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
  108. mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
  109. mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
  110. mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
  111. mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
  112. mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
  113. mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
  114. mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
  115. mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
  116. mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
  117. mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
  118. mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
  119. mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
  120. mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
  121. mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
  122. mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
  123. mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
  124. mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
  125. mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
  126. mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
  127. mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
  128. mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
  129. mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
  130. mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
  131. mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
  132. mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
  133. mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
  134. mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
  135. mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
  136. mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
  137. mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
  138. mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
  139. mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
  140. mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
  141. mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
  142. mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
  143. mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
  144. mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
  145. mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
  146. mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
  147. mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
  148. mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
  149. mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
  150. mloda_plugins/feature_group/input_data/read_db.py +7 -9
  151. mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
  152. mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
  153. mloda_plugins/feature_group/input_data/read_file.py +8 -8
  154. mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
  155. mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
  156. mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
  157. mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
  158. mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
  159. mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
  160. mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
  161. mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
  162. mloda-0.3.3.dist-info/RECORD +0 -230
  163. mloda_core/abstract_plugins/components/link.py +0 -286
  164. mloda_core/abstract_plugins/function_extender.py +0 -34
  165. mloda_core/runtime/run.py +0 -617
  166. {mloda_core → mloda/core}/__init__.py +0 -0
  167. {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
  168. {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
  169. {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
  170. {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
  171. {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
  172. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
  173. {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
  174. {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
  175. {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
  176. {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
  177. {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
  178. {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
  179. {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
  180. {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
  181. {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
  182. {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
  183. {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
  184. {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
  185. {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
  186. {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
  187. {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
  188. {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
  189. {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
  190. {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
  191. {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
  192. {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
  193. {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
  194. {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
  195. {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
  196. {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
  197. {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
  198. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
  199. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
  200. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
  201. {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
@@ -1,11 +1,11 @@
1
1
  from typing import Optional, Set, Tuple, Type
2
2
 
3
- from mloda_core.prepare.accessible_plugins import FeatureGroupEnvironmentMapping
4
- from mloda_core.abstract_plugins.components.data_access_collection import DataAccessCollection
5
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
6
- from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
7
- from mloda_core.abstract_plugins.components.feature import Feature
8
- from mloda_core.abstract_plugins.components.link import Link
3
+ from mloda.core.prepare.accessible_plugins import FeatureGroupEnvironmentMapping
4
+ from mloda.core.abstract_plugins.components.data_access_collection import DataAccessCollection
5
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
6
+ from mloda.core.abstract_plugins.feature_group import FeatureGroup
7
+ from mloda.core.abstract_plugins.components.feature import Feature
8
+ from mloda.core.abstract_plugins.components.link import Link
9
9
 
10
10
  import logging
11
11
 
@@ -53,9 +53,7 @@ class IdentifyFeatureGroupClass:
53
53
  _identified_feature_groups = self.filter_subclasses(_identified_feature_groups)
54
54
  return _identified_feature_groups
55
55
 
56
- def _filter_feature_group_by_links(
57
- self, feature_group: Type[AbstractFeatureGroup], links: Optional[Set[Link]]
58
- ) -> bool:
56
+ def _filter_feature_group_by_links(self, feature_group: Type[FeatureGroup], links: Optional[Set[Link]]) -> bool:
59
57
  # Case index columns not given, so no validation possible
60
58
  if feature_group.index_columns() is None:
61
59
  return True
@@ -76,18 +74,18 @@ class IdentifyFeatureGroupClass:
76
74
 
77
75
  def _filter_feature_group_by_criteria(
78
76
  self,
79
- feature_group: Type[AbstractFeatureGroup],
77
+ feature_group: Type[FeatureGroup],
80
78
  feature: Feature,
81
79
  data_access_collection: Optional[DataAccessCollection],
82
80
  ) -> bool:
83
81
  return feature_group.match_feature_group_criteria(feature.name, feature.options, data_access_collection)
84
82
 
85
- def _filter_feature_group_by_domain(self, feature_group: Type[AbstractFeatureGroup], feature: Feature) -> bool:
83
+ def _filter_feature_group_by_domain(self, feature_group: Type[FeatureGroup], feature: Feature) -> bool:
86
84
  return not feature.domain or feature_group.get_domain() == feature.domain
87
85
 
88
86
  def _filter_feature_group_by_framework(
89
87
  self,
90
- compute_frameworks: Set[Type[ComputeFrameWork]],
88
+ compute_frameworks: Set[Type[ComputeFramework]],
91
89
  feature: Feature,
92
90
  ) -> bool:
93
91
  if feature.compute_frameworks is None:
@@ -114,7 +112,7 @@ class IdentifyFeatureGroupClass:
114
112
  if not compute_frameworks:
115
113
  raise ValueError(f"Feature {feature.name} {feature_group_class.get_class_name()} has no compute framework.")
116
114
 
117
- def get(self) -> Tuple[Type[AbstractFeatureGroup], Set[Type[ComputeFrameWork]]]:
115
+ def get(self) -> Tuple[Type[FeatureGroup], Set[Type[ComputeFramework]]]:
118
116
  return next(iter(self.feature_group_compute_framework_mapping.items()))
119
117
 
120
118
  def filter_subclasses(
@@ -123,7 +121,7 @@ class IdentifyFeatureGroupClass:
123
121
  """
124
122
  This functionality ensures that only subclass feature groups are kept.
125
123
  """
126
- fgs_to_pop: Set[Type[AbstractFeatureGroup]] = set()
124
+ fgs_to_pop: Set[Type[FeatureGroup]] = set()
127
125
 
128
126
  for i_feature_group, i_compute_frameworks in _identified_feature_groups.items():
129
127
  for o_feature_group, o_compute_frameworks in _identified_feature_groups.items():
@@ -1,8 +1,8 @@
1
1
  from collections import defaultdict
2
2
  from typing import Dict, Set, Type
3
3
  from uuid import UUID
4
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
5
- from mloda_core.core.step.join_step import JoinStep
4
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
5
+ from mloda.core.core.step.join_step import JoinStep
6
6
 
7
7
 
8
8
  class JoinStepCollection:
@@ -10,7 +10,7 @@ class JoinStepCollection:
10
10
  self.collection: Dict[JoinStep, Set[UUID]] = defaultdict(set)
11
11
 
12
12
  def similar_dependent_joins_uuids(
13
- self, left_framework: Type[ComputeFrameWork], right_framework: Type[ComputeFrameWork]
13
+ self, left_framework: Type[ComputeFramework], right_framework: Type[ComputeFramework]
14
14
  ) -> Set[UUID]:
15
15
  """
16
16
  This functionality makes sure that we do not write on the same datasets due to overlapping joins at once.
@@ -2,10 +2,10 @@ from copy import deepcopy
2
2
  from typing import Any, Dict, List, Set, Type
3
3
  from collections import defaultdict
4
4
  from uuid import UUID
5
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
6
- from mloda_core.prepare.graph.graph import Graph
7
- from mloda_core.prepare.resolve_links import LinkFrameworkTrekker, LinkTrekker
8
- from mloda_core.abstract_plugins.components.link import JoinType, Link
5
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
6
+ from mloda.core.prepare.graph.graph import Graph
7
+ from mloda.core.prepare.resolve_links import LinkFrameworkTrekker, LinkTrekker
8
+ from mloda.core.abstract_plugins.components.link import JoinType, Link
9
9
 
10
10
 
11
11
  class ResolveComputeFrameworks:
@@ -108,8 +108,8 @@ class ResolveComputeFrameworks:
108
108
  self.to_invert_trekker_collection = []
109
109
 
110
110
  def resolve_trekked_links(
111
- self, trekked_links: List[LinkFrameworkTrekker], compute_frameworks: Set[Type[ComputeFrameWork]]
112
- ) -> Set[Type[ComputeFrameWork]]:
111
+ self, trekked_links: List[LinkFrameworkTrekker], compute_frameworks: Set[Type[ComputeFramework]]
112
+ ) -> Set[Type[ComputeFramework]]:
113
113
  new_cfws = set()
114
114
 
115
115
  for link, left_cfw, right_cfw in trekked_links:
@@ -1,23 +1,23 @@
1
1
  from collections import defaultdict
2
2
  from typing import Dict, List, Optional, Set, Tuple, Type, Union
3
3
  from uuid import UUID
4
- from mloda_core.prepare.graph.graph import Graph
5
- from mloda_core.prepare.resolve_compute_frameworks import ResolveComputeFrameworks
6
- from mloda_core.prepare.resolve_links import LinkFrameworkTrekker, ResolveLinks
7
- from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
8
- from mloda_core.abstract_plugins.components.feature import Feature
9
- from mloda_core.abstract_plugins.components.link import Link
4
+ from mloda.core.prepare.graph.graph import Graph
5
+ from mloda.core.prepare.resolve_compute_frameworks import ResolveComputeFrameworks
6
+ from mloda.core.prepare.resolve_links import LinkFrameworkTrekker, ResolveLinks
7
+ from mloda.core.abstract_plugins.feature_group import FeatureGroup
8
+ from mloda.core.abstract_plugins.components.feature import Feature
9
+ from mloda.core.abstract_plugins.components.link import Link
10
10
 
11
11
 
12
- LinkFeatureQueue = List[Union[LinkFrameworkTrekker, Tuple[Feature, Type[AbstractFeatureGroup]]]]
12
+ LinkFeatureQueue = List[Union[LinkFrameworkTrekker, Tuple[Feature, Type[FeatureGroup]]]]
13
13
 
14
- PlannedQueue = List[Union[LinkFrameworkTrekker, Tuple[Type[AbstractFeatureGroup], Set[Feature]]]]
14
+ PlannedQueue = List[Union[LinkFrameworkTrekker, Tuple[Type[FeatureGroup], Set[Feature]]]]
15
15
 
16
16
 
17
17
  class ResolveGraph:
18
18
  def __init__(self, graph: Graph, links: Optional[Set[Link]] = None):
19
19
  self.graph = graph
20
- self.nodes_per_feature_group: Dict[Type[AbstractFeatureGroup], Set[Feature]] = {}
20
+ self.nodes_per_feature_group: Dict[Type[FeatureGroup], Set[Feature]] = {}
21
21
  self.resolver_compute_framework = ResolveComputeFrameworks(self.graph)
22
22
  self.resolver_links = ResolveLinks(self.graph, links)
23
23
 
@@ -73,8 +73,8 @@ class ResolveGraph:
73
73
 
74
74
  return feature_link_queue
75
75
 
76
- def get_nodes_with_same_feature_group_class(self) -> Dict[Type[AbstractFeatureGroup], Set[Feature]]:
77
- collection: Dict[Type[AbstractFeatureGroup], Set[Feature]] = defaultdict(set)
76
+ def get_nodes_with_same_feature_group_class(self) -> Dict[Type[FeatureGroup], Set[Feature]]:
77
+ collection: Dict[Type[FeatureGroup], Set[Feature]] = defaultdict(set)
78
78
 
79
79
  for node in self.graph.queue:
80
80
  node_properties = self.graph.get_nodes()[node]
@@ -2,12 +2,13 @@ from collections import OrderedDict, defaultdict
2
2
  from typing import Dict, List, Optional, Set, Tuple, Type, Union
3
3
  from uuid import UUID
4
4
 
5
- from mloda_core.abstract_plugins.compute_frame_work import ComputeFrameWork
6
- from mloda_core.prepare.graph.graph import Graph
7
- from mloda_core.abstract_plugins.components.link import Link
5
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
6
+ from mloda.core.prepare.graph.graph import Graph
7
+ from mloda.core.abstract_plugins.components.link import Link
8
+ from mloda.core.prepare.validators.resolve_link_validator import ResolveLinkValidator
8
9
 
9
10
 
10
- LinkFrameworkTrekker = Tuple[Link, Type[ComputeFrameWork], Type[ComputeFrameWork]]
11
+ LinkFrameworkTrekker = Tuple[Link, Type[ComputeFramework], Type[ComputeFramework]]
11
12
 
12
13
 
13
14
  class LinkTrekker:
@@ -23,7 +24,7 @@ class LinkTrekker:
23
24
  self.data[key].add(value)
24
25
 
25
26
  def invert_link(
26
- self, link: Link, left_cfw: Type[ComputeFrameWork], right_cfw: Type[ComputeFrameWork], uuid: UUID
27
+ self, link: Link, left_cfw: Type[ComputeFramework], right_cfw: Type[ComputeFramework], uuid: UUID
27
28
  ) -> None:
28
29
  """
29
30
  The purpose of this function is to invert left and right of an index during a run.
@@ -45,7 +46,7 @@ class LinkTrekker:
45
46
  del self.data[(link, left_cfw, right_cfw)]
46
47
  del self.data_ordered[(link, left_cfw, right_cfw)]
47
48
 
48
- def get_position(self, link: Link, left_cfw: Type[ComputeFrameWork], right_cfw: Type[ComputeFrameWork]) -> int:
49
+ def get_position(self, link: Link, left_cfw: Type[ComputeFramework], right_cfw: Type[ComputeFramework]) -> int:
49
50
  for i, (k, _) in enumerate(self.data_ordered.items()):
50
51
  if k == (link, left_cfw, right_cfw):
51
52
  return i
@@ -203,8 +204,7 @@ class LinkTrekker:
203
204
  if k not in self.data_ordered:
204
205
  self.data_ordered[k] = v
205
206
 
206
- if len(self.data.items()) != len(self.data_ordered.items()):
207
- raise ValueError("Data ordered is not the same length as data!")
207
+ ResolveLinkValidator.validate_data_consistency(self.data, self.data_ordered)
208
208
 
209
209
 
210
210
  class ResolveLinks:
@@ -241,27 +241,7 @@ class ResolveLinks:
241
241
  self.graph.set_root_parents_by_direct_()
242
242
 
243
243
  self.go_through_each_child_and_its_parents_and_look_for_links()
244
- self.validate_link_trekker()
245
-
246
- def validate_link_trekker(self) -> None:
247
- for link_fw_trekker, _ in self.link_trekker.data.items():
248
- for other_fw_trekker, _ in self.link_trekker.data.items():
249
- if link_fw_trekker == other_fw_trekker:
250
- continue
251
-
252
- link, _, _ = link_fw_trekker
253
- other, _, _ = other_fw_trekker
254
-
255
- # if feature group match
256
- if (
257
- link.left_feature_group.get_class_name() == other.left_feature_group.get_class_name()
258
- and link.right_feature_group.get_class_name() == other.right_feature_group.get_class_name()
259
- ):
260
- # case join different
261
- if link.jointype != other.jointype:
262
- raise Exception(
263
- f"Link {link} and {other} have the same feature groups, but different join types!"
264
- )
244
+ ResolveLinkValidator.validate_no_conflicting_join_types(self.link_trekker.data)
265
245
 
266
246
  def get_link_trekker(self) -> LinkTrekker:
267
247
  return self.link_trekker
@@ -374,8 +354,8 @@ class ResolveLinks:
374
354
  def create_link_trekker_key(
375
355
  self,
376
356
  link: Link,
377
- left_frameworks: Optional[Set[Type[ComputeFrameWork]]] = None,
378
- right_frameworks: Optional[Set[Type[ComputeFrameWork]]] = None,
357
+ left_frameworks: Optional[Set[Type[ComputeFramework]]] = None,
358
+ right_frameworks: Optional[Set[Type[ComputeFramework]]] = None,
379
359
  ) -> LinkFrameworkTrekker:
380
360
  if left_frameworks is None or right_frameworks is None:
381
361
  raise ValueError("Left or right frameworks are not set!")
@@ -0,0 +1,32 @@
1
+ from collections import OrderedDict
2
+ from typing import Any, Dict, Set
3
+ from uuid import UUID
4
+
5
+
6
+ class ResolveLinkValidator:
7
+ @staticmethod
8
+ def validate_data_consistency(
9
+ data: Dict[Any, Set[UUID]],
10
+ data_ordered: "OrderedDict[Any, Set[UUID]]",
11
+ ) -> None:
12
+ if len(data.items()) != len(data_ordered.items()):
13
+ raise ValueError("Data and data_ordered have different lengths")
14
+
15
+ @staticmethod
16
+ def validate_no_conflicting_join_types(data: Dict[Any, Set[UUID]]) -> None:
17
+ seen_pairs: Dict[Any, Any] = {}
18
+ for key in data.keys():
19
+ link, _, _ = key
20
+ left_fg = link.left_feature_group
21
+ right_fg = link.right_feature_group
22
+ jointype = link.jointype
23
+
24
+ pair_key = (left_fg, right_fg)
25
+
26
+ if pair_key in seen_pairs:
27
+ if seen_pairs[pair_key] != jointype:
28
+ raise Exception(
29
+ f"Conflicting join types for {left_fg.get_class_name()} and {right_fg.get_class_name()}"
30
+ )
31
+ else:
32
+ seen_pairs[pair_key] = jointype
@@ -0,0 +1,271 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+ import threading
5
+ import traceback
6
+ import logging
7
+ from typing import Any, Callable, Dict, Optional, Set, Type
8
+ from uuid import UUID, uuid4
9
+
10
+ from mloda.core.abstract_plugins.compute_framework import ComputeFramework
11
+ from mloda.core.abstract_plugins.components.parallelization_modes import ParallelizationMode
12
+ from mloda.core.core.cfw_manager import CfwManager
13
+ from mloda.core.core.step.feature_group_step import FeatureGroupStep
14
+ from mloda.core.core.step.join_step import JoinStep
15
+ from mloda.core.core.step.transform_frame_work_step import TransformFrameworkStep
16
+ from mloda.core.runtime.worker_manager import WorkerManager
17
+ from mloda.core.runtime.worker.thread_worker import thread_worker
18
+ from mloda.core.runtime.worker.multiprocessing_worker import worker
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class ComputeFrameworkExecutor:
    """
    Manages compute framework initialization and step execution.

    Extracted from Runner class to handle CFW lifecycle and step execution logic.
    """

    # Shared by every "look up, then create if missing" section below. A lock that is
    # created inside the ``with`` statement itself is new on each call and therefore
    # never excludes anybody, so a single long-lived lock is used instead. It lives on
    # the class so instance state stays unchanged, and it is re-entrant so nested use
    # from an overriding subclass cannot deadlock.
    # NOTE(review): the methods in this class only hand steps/frameworks to worker
    # threads and processes, never the executor itself, so an in-process lock is
    # assumed to be sufficient - confirm against the callers.
    _cfw_lock = threading.RLock()

    def __init__(self, cfw_register: CfwManager, worker_manager: WorkerManager) -> None:
        """
        Initialize the executor with dependencies.

        Args:
            cfw_register: The CFW manager for registering compute frameworks.
            worker_manager: The worker manager for handling parallel execution.
        """
        # Compute framework instances created by this executor, keyed by their UUID.
        self.cfw_collection: Dict[UUID, ComputeFramework] = {}
        self.cfw_register = cfw_register
        self.worker_manager = worker_manager

    def init_compute_framework(
        self,
        cf_class: Type[ComputeFramework],
        parallelization_mode: ParallelizationMode,
        children_if_root: Set[UUID],
        uuid: Optional[UUID] = None,
    ) -> UUID:
        """
        Initializes a compute framework and records it in the register and the collection.

        Args:
            cf_class: The compute framework class to instantiate.
            parallelization_mode: Mode handed to the new framework instance.
            children_if_root: UUIDs passed to the framework (as a frozenset) and to the register.
            uuid: UUID for the new framework; a random one is generated when omitted.

        Returns:
            The UUID of the compute framework.
        """
        function_extender = self.cfw_register.get_function_extender()

        new_cfw = cf_class(
            parallelization_mode,
            frozenset(children_if_root),
            uuid or uuid4(),
            function_extender=function_extender,
        )
        new_cfw_uuid = new_cfw.get_uuid()

        # add to register
        self.cfw_register.add_cfw_to_compute_frameworks(new_cfw_uuid, cf_class.get_class_name(), children_if_root)

        # add to collection
        self.cfw_collection[new_cfw_uuid] = new_cfw

        return new_cfw_uuid

    def add_compute_framework(
        self,
        step: Any,
        parallelization_mode: ParallelizationMode,
        feature_uuid: UUID,
        children_if_root: Set[UUID],
    ) -> UUID:
        """
        Returns the CFW registered for the step's framework and feature, creating it if missing.

        Args:
            step: Step exposing ``compute_framework`` (the framework class).
            parallelization_mode: Mode used when a new framework has to be created.
            feature_uuid: Feature UUID used for the register lookup.
            children_if_root: UUIDs handed to a newly created framework.

        Returns:
            The UUID of the compute framework.
        """
        # Lookup and creation form one critical section, so two callers cannot both
        # see "missing" and create the framework twice.
        with self._cfw_lock:
            cfw_uuid = self.cfw_register.get_cfw_uuid(step.compute_framework.get_class_name(), feature_uuid)
            # if cfw does not exist, create a new one
            if cfw_uuid is None:
                cfw_uuid = self.init_compute_framework(step.compute_framework, parallelization_mode, children_if_root)

        return cfw_uuid

    def get_cfw(self, compute_framework: Type[ComputeFramework], feature_uuid: UUID) -> ComputeFramework:
        """
        Retrieves a compute framework based on its type and a feature UUID.

        Args:
            compute_framework: The type of compute framework to retrieve.
            feature_uuid: The UUID of the feature associated with the compute framework.

        Returns:
            The framework instance stored in the collection.

        Raises:
            ValueError: If the register knows no initialized framework for the arguments.
        """
        cfw_uuid = self.cfw_register.get_initialized_compute_framework_uuid(
            compute_framework, feature_uuid=feature_uuid
        )
        if cfw_uuid is None:
            raise ValueError(f"cfw_uuid should not be none: {compute_framework}.")
        return self.cfw_collection[cfw_uuid]

    def _get_execution_function(
        self, mode_by_cfw_register: Set[ParallelizationMode], mode_by_step: Set[ParallelizationMode]
    ) -> Callable[[Any], None]:
        """
        Identifies the execution mode and returns the corresponding execute step function.

        Only modes present in both sets are considered. Multiprocessing wins over
        threading; synchronous execution is the fallback.

        Returns:
            The execute step function corresponding to the identified mode.
        """
        modes = mode_by_cfw_register.intersection(mode_by_step)

        if ParallelizationMode.MULTIPROCESSING in modes:
            return self.multi_execute_step
        if ParallelizationMode.THREADING in modes:
            return self.thread_execute_step
        return self.sync_execute_step

    def prepare_execute_step(self, step: Any, parallelization_mode: ParallelizationMode) -> UUID:
        """
        Prepares a step for execution by initializing or retrieving the associated CFW.

        Args:
            step: A FeatureGroupStep, TransformFrameworkStep or JoinStep.
            parallelization_mode: Mode used for any framework created here.

        Returns:
            The UUID of the compute framework the step runs on.

        Raises:
            ValueError: If a required UUID cannot be determined, or if no framework
                could be resolved (including steps of an unhandled type).
        """
        cfw_uuid: Optional[UUID] = None

        if isinstance(step, FeatureGroupStep):
            # Reuse a framework that is already registered for one of the step's tfs ids.
            for tfs_id in step.tfs_ids:
                cfw_uuid = self.cfw_register.get_cfw_uuid(step.compute_framework.get_class_name(), tfs_id)
                if cfw_uuid:
                    return cfw_uuid

            feature_uuid = step.features.any_uuid

            if feature_uuid is None:
                raise ValueError(f"from_feature_uuid should not be none. {step, feature_uuid}")

            cfw_uuid = self.add_compute_framework(step, parallelization_mode, feature_uuid, set(step.children_if_root))
        elif isinstance(step, TransformFrameworkStep):
            # Find the first required feature that already has a source framework.
            from_feature_uuid, from_cfw_uuid = None, None
            for r_f in step.required_uuids:
                from_cfw_uuid = self.cfw_register.get_cfw_uuid(step.from_framework.get_class_name(), r_f)
                if from_cfw_uuid:
                    from_feature_uuid = r_f
                    break

            if from_feature_uuid is None or from_cfw_uuid is None:
                raise ValueError(
                    f"from_feature_uuid or from_cfw_uuid should not be none. {step, from_feature_uuid, from_cfw_uuid}"
                )

            from_cfw = self.cfw_collection[from_cfw_uuid]
            childrens = set(from_cfw.children_if_root)

            # NOTE(review): nesting reconstructed from a flattened diff - the link id is
            # added to the children only when the step has one; confirm against upstream.
            if step.link_id:
                from_feature_uuid = step.link_id
                childrens.add(from_feature_uuid)

            with self._cfw_lock:
                cfw_uuid = self.init_compute_framework(step.to_framework, parallelization_mode, childrens, step.uuid)

        elif isinstance(step, JoinStep):
            cfw_uuid = self.cfw_register.get_cfw_uuid(
                step.left_framework.get_class_name(), next(iter(step.left_framework_uuids))
            )

        if cfw_uuid is None:
            raise ValueError(f"This should not occur. {step}")

        return cfw_uuid

    def prepare_tfs_right_cfw(self, step: TransformFrameworkStep) -> UUID:
        """
        Prepares the right CFW for a TransformFrameworkStep.

        The lookup uses ``step.right_framework_uuid`` when it is set and otherwise an
        arbitrary element of ``step.required_uuids``.

        Returns:
            The UUID of the framework registered for ``step.from_framework``.

        Raises:
            ValueError: If the register does not return a UUID.
        """
        uuid = step.right_framework_uuid if step.right_framework_uuid else next(iter(step.required_uuids))

        cfw_uuid = self.cfw_register.get_cfw_uuid(step.from_framework.get_class_name(), uuid)

        # ``None`` is not a UUID either, so this single check covers both failure cases.
        if not isinstance(cfw_uuid, UUID):
            raise ValueError(
                f"cfw_uuid should not be none in prepare_tfs: {step.from_framework.get_class_name()}, {uuid}"
            )

        return cfw_uuid

    def prepare_tfs_and_joinstep(self, step: Any) -> Any:
        """
        Prepares CFWs required for TransformFrameworkStep or JoinStep.

        Returns:
            The source framework instance from the collection, or ``None`` for any
            other step type.

        Raises:
            ValueError: If no source framework is registered for the step.
        """
        if isinstance(step, TransformFrameworkStep):
            return self.cfw_collection[self.prepare_tfs_right_cfw(step)]

        if isinstance(step, JoinStep):
            # Left framework here, because it is already transformed beforehand
            left_name = step.left_framework.get_class_name()
            from_cfw_uuid = self.cfw_register.get_cfw_uuid(left_name, step.link.uuid)

            if from_cfw_uuid is None:
                from_cfw_uuid = self.cfw_register.get_cfw_uuid(left_name, next(iter(step.right_framework_uuids)))

            if from_cfw_uuid is None:
                raise ValueError(
                    f"from_cfw_uuid should not be none: {step.left_framework.get_class_name()}, {step.link.uuid}"
                )

            return self.cfw_collection[from_cfw_uuid]

        return None

    def sync_execute_step(self, step: Any) -> None:
        """
        Executes a step synchronously.

        Errors raised while executing are not propagated: they are logged and stored
        on the register through ``set_error``. Errors from ``prepare_execute_step``
        are raised to the caller.
        """
        cfw_uuid = self.prepare_execute_step(step, ParallelizationMode.SYNC)

        try:
            from_cfw = self.prepare_tfs_and_joinstep(step) or None
            step.execute(self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw=from_cfw)
            step.step_is_done = True

        except Exception as e:
            # Format the traceback once and reuse it for the log line and the register.
            exc_info = traceback.format_exc()
            msg = f"An error occurred: {e}\nFull traceback:\n{exc_info}"
            logger.error(msg)
            self.cfw_register.set_error(msg, exc_info)

    def thread_execute_step(self, step: Any) -> None:
        """
        Executes a step in a separate thread.

        The thread is created here and handed to the worker manager; this method
        does not start it itself.
        """
        cfw_uuid = self.prepare_execute_step(step, ParallelizationMode.THREADING)
        from_cfw = self.prepare_tfs_and_joinstep(step) or None

        task = threading.Thread(
            target=thread_worker,
            args=(step, self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw),
        )

        self.worker_manager.add_thread_task(task)

    def multi_execute_step(self, step: Any) -> None:
        """
        Executes a step in a separate process.

        A worker process is created for the step's framework unless the worker
        manager already has queues for it; the step is then sent as a command.
        """
        cfw_uuid = self.prepare_execute_step(step, ParallelizationMode.MULTIPROCESSING)

        # Unlike the sync/thread paths, the worker receives the source framework's
        # UUID (as returned by prepare_tfs_right_cfw), not the framework instance.
        from_cfw = None
        if isinstance(step, TransformFrameworkStep):
            from_cfw = self.prepare_tfs_right_cfw(step)

        if self.worker_manager.get_process_queues(cfw_uuid) is None:
            self.worker_manager.create_worker_process(
                cfw_uuid,
                worker,
                (self.cfw_register, self.cfw_collection[cfw_uuid], from_cfw),
            )

        self.worker_manager.send_command(cfw_uuid, step)