mlrun-1.6.4rc7-py3-none-any.whl → mlrun-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (305)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +131 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +129 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc7.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc7.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc7.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
The diff below is for mlrun/datastore/targets.py (entry 76 above, +453 -176).

@@ -17,9 +17,10 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union
 from urllib.parse import urlparse
 
 import pandas as pd
@@ -28,8 +29,13 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import (
+    get_snowflake_password,
+    get_snowflake_spark_options,
+)
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
-from mlrun.utils import now_date
+from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
 from mlrun.utils.v3io_clients import get_frames_client
 
@@ -41,9 +47,7 @@ from .spark_utils import spark_session_update_hadoop_options
 from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
-    parse_kafka_url,
     select_columns_from_df,
-    store_path_to_spark,
 )
 
 
@@ -58,6 +62,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"
 
     @staticmethod
     def all():
@@ -72,6 +77,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]
 
 
@@ -79,11 +85,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"
 
 
-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-    df.write.mode(mode).save(**non_hadoop_spark_options)
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)
 
 
 def default_target_names():
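Note (illustrative, not part of the package): a minimal sketch of how the new write_format argument is meant to be used, mirroring the call sites later in this diff. The target and DataFrame are assumptions supplied by the caller.

    from mlrun.datastore.targets import write_spark_dataframe_with_options

    def overwrite_with_target_options(target, df):
        # target: a Spark-capable mlrun target (e.g. ParquetTarget); df: a pyspark DataFrame
        options = target.get_spark_options(key_column="id", timestamp_key="timestamp")
        # the "format" entry is popped out of the options and passed explicitly,
        # so connectors such as Snowflake can route through df.write.format(...)
        write_format = options.pop("format", None)
        write_spark_dataframe_with_options(options, df, "overwrite", write_format=write_format)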
@@ -215,9 +224,8 @@ def validate_target_list(targets):
     ]
     if target_types_requiring_name:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Only one default name per target type is allowed (please specify name for {0} target)".format(
-                target_types_requiring_name
-            )
+            "Only one default name per target type is allowed (please "
+            f"specify name for {target_types_requiring_name} target)"
         )
 
     target_names_count = Counter(
@@ -232,9 +240,8 @@ def validate_target_list(targets):
 
     if targets_with_same_name:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Each target must have a unique name (more than one target with those names found {0})".format(
-                targets_with_same_name
-            )
+            "Each target must have a unique name (more than one target with "
+            f"those names found {targets_with_same_name})"
         )
 
     no_path_target_types_count = Counter(
@@ -252,9 +259,8 @@ def validate_target_list(targets):
     ]
     if target_types_requiring_path:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Only one default path per target type is allowed (please specify path for {0} target)".format(
-                target_types_requiring_path
-            )
+            "Only one default path per target type is allowed (please specify "
+            f"path for {target_types_requiring_path} target)"
        )
 
     target_paths_count = Counter(
@@ -269,9 +275,8 @@ def validate_target_list(targets):
 
     if targets_with_same_path:
         raise mlrun.errors.MLRunInvalidArgumentError(
-            "Each target must have a unique path (more than one target with those names found {0})".format(
-                targets_with_same_path
-            )
+            "Each target must have a unique path (more than one target "
+            f"with those names found {targets_with_same_path})"
         )
 
 
@@ -384,23 +389,24 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False
 
     def __init__(
         self,
         name: str = "",
         path=None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, str] = None,
         after_step=None,
         columns=None,
         partitioned: bool = False,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[List[str]] = None,
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = None,
         flush_after_seconds: Optional[int] = None,
-        storage_options: Dict[str, str] = None,
-        schema: Dict[str, Any] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
         credentials_prefix=None,
     ):
         super().__init__(
@@ -433,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
         self.storage_options = storage_options
         self.schema = schema or {}
         self.credentials_prefix = credentials_prefix
+        if credentials_prefix:
+            warnings.warn(
+                "The 'credentials_prefix' parameter is deprecated and will be removed in "
+                "1.9.0. Please use datastore profiles instead.",
+                FutureWarning,
+            )
 
         self._target = None
         self._resource = None
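Note (illustrative): constructing any store target with credentials_prefix now emits a FutureWarning, with datastore profiles as the suggested replacement. A minimal sketch with placeholder values:

    import warnings
    from mlrun.datastore.targets import BaseStoreTarget

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # placeholder path and prefix, for illustration only
        BaseStoreTarget(name="t", path="s3://my-bucket/data", credentials_prefix="MY_SECRETS")
    assert any(issubclass(w.category, FutureWarning) for w in caught)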
@@ -452,14 +464,11 @@ class BaseStoreTarget(DataTargetBase):
             if self.credentials_prefix
             else None
         )
-        store, resolved_store_path = mlrun.store_manager.get_or_create_store(
+        store, resolved_store_path, url = mlrun.store_manager.get_or_create_store(
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-        if self.get_target_path() and self.get_target_path().startswith("ds://"):
-            return store, store.url + resolved_store_path
-        else:
-            return store, self.get_target_path()
+        return store, resolved_store_path, url
 
     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -505,10 +514,13 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -529,18 +541,21 @@ class BaseStoreTarget(DataTargetBase):
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
-            if file_system.protocol == "file":
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
             target_df = df
             partition_cols = None  # single parquet file
-            if not target_path.endswith(".parquet") and not target_path.endswith(
-                ".pq"
-            ):  # directory
+            if not mlrun.utils.helpers.is_parquet_file(target_path):  # directory
                 partition_cols = []
                 if timestamp_key and (
                     self.partitioned or self.time_partitioning_granularity
@@ -649,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
@@ -665,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
             self.kind, self.name, self.get_target_templated_path()
         )
         target = self._target
+        target.attributes = self.attributes
         target.run_id = self.run_id
         target.status = status or target.status or "created"
         target.updated = now_date().isoformat()
@@ -693,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()
 
     def purge(self):
-        store, target_path = self._get_store_and_path()
-        store.rm(target_path, recursive=True)
+        """
+        Delete the files of the target.
+
+        Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
+        """
+        store, path_in_store, target_path = self._get_store_and_path()
+        if path_in_store not in ["", "/"]:
+            store.rm(path_in_store, recursive=True)
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Unable to delete target. Please Use purge_targets from FeatureSet object."
+            )
 
     def as_df(
         self,
@@ -707,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
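Note (illustrative): the new support_pandas flag makes it explicit which target kinds can be read back with as_df; the others now fail fast with NotImplementedError. A quick way to list them, using the kind_to_driver mapping defined near the bottom of this file:

    from mlrun.datastore.targets import kind_to_driver

    pandas_readable = [
        kind for kind, driver in kind_to_driver.items() if getattr(driver, "support_pandas", False)
    ]
    print(pandas_readable)  # per the flags in this diff: parquet, csv, dataframe, custom, sql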
@@ -721,17 +780,25 @@ class BaseStoreTarget(DataTargetBase):
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()
 
-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df
 
     def get_dask_options(self):
         raise NotImplementedError()
 
+    @property
+    def source_spark_attributes(self) -> dict:
+        return {}
+
 
 class ParquetTarget(BaseStoreTarget):
-    """parquet target storage driver, used to materialize feature set/vector data into parquet files
+    """Parquet target storage driver, used to materialize feature set/vector data into parquet files.
 
     :param name: optional, target name. By default will be called ParquetTarget
     :param path: optional, Output path. Can be either a file or directory.
@@ -760,22 +827,23 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True
 
     def __init__(
         self,
         name: str = "",
         path=None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, str] = None,
         after_step=None,
         columns=None,
         partitioned: bool = None,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[List[str]] = None,
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = 10000,
         flush_after_seconds: Optional[int] = 900,
-        storage_options: Dict[str, str] = None,
+        storage_options: dict[str, str] = None,
     ):
         self.path = path
         if partitioned is None:
@@ -865,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
                 if time_unit == time_partitioning_granularity:
                     break
 
-        if (
-            not self.partitioned
-            and not self.get_target_path().endswith(".parquet")
-            and not self.get_target_path().endswith(".pq")
+        target_path = self.get_target_path()
+        if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
+            target_path
         ):
             partition_cols = []
 
@@ -876,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))
 
-        store, target_path = self._get_store_and_path()
-
-        storage_options = store.get_storage_options()
-        if storage_options and self.storage_options:
-            storage_options = merge(storage_options, self.storage_options)
-        else:
-            storage_options = storage_options or self.storage_options
-
         step = graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.ParquetTarget",
+            class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
             path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             update_last_written=featureset_status.update_last_written_for_target,
@@ -929,27 +987,19 @@ class ParquetTarget(BaseStoreTarget):
                 if unit == time_partitioning_granularity:
                     break
 
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(
-                self.get_target_path()
-            )
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            result = {**result, **storage_spark_options}
-        else:
-            result = {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
         for partition_col in self.partition_cols or []:
             partition_cols.append(partition_col)
         if partition_cols:
-            result["partitionBy"] = partition_cols
-        return result
+            spark_options["partitionBy"] = partition_cols
+        return spark_options
 
     def get_dask_options(self):
         return {"format": "parquet"}
@@ -962,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -972,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
@@ -993,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
         return result
 
     def is_single_file(self):
-        if self.path:
-            return self.path.endswith(".parquet") or self.path.endswith(".pq")
-        return False
+        return mlrun.utils.helpers.is_parquet_file(self.path)
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         # If partitioning by time, add the necessary columns
@@ -1035,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True
 
     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1056,39 +1107,30 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        target_path = self.get_target_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.CSVTarget",
+            class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
             path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=store.get_storage_options(),
             **self.attributes,
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(
-                self.get_target_path()
-            )
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.get_target_path()),
+        store, path, url = self._get_store_and_path()
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
             }
+        )
+        return spark_options
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         import pyspark.sql.functions as funcs
@@ -1110,8 +1152,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1132,6 +1178,134 @@ class CSVTarget(BaseStoreTarget):
         return True
 
 
+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if table_name:
+            attributes["table"] = table_name
+
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        import snowflake.connector
+
+        missing = [
+            key
+            for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
+            if self.attributes.get(key) is None
+        ]
+        if missing:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Can't purge Snowflake target, "
+                f"some attributes are missing: {', '.join(missing)}"
+            )
+        account = self.attributes["url"].replace(".snowflakecomputing.com", "")
+
+        with snowflake.connector.connect(
+            account=account,
+            user=self.attributes["user"],
+            password=get_snowflake_password(),
+            warehouse=self.attributes["warehouse"],
+        ) as snowflake_connector:
+            drop_statement = (
+                f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
+                f".{self.attributes['table']}"
+            )
+            snowflake_connector.execute_string(drop_statement)
+
+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support pandas engine"
+        )
+
+    @property
+    def source_spark_attributes(self) -> dict:
+        keys = ["url", "user", "database", "db_schema", "warehouse"]
+        attributes = self.attributes or {}
+        snowflake_dict = {key: attributes.get(key) for key in keys}
+        table = attributes.get("table")
+        snowflake_dict["query"] = f"SELECT * from {table}" if table else None
+        return snowflake_dict
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
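Note (illustrative): a minimal usage sketch for the new SnowflakeTarget, using placeholder connection details. The Snowflake password is expected to come from secrets or the environment (see get_snowflake_password in mlrun.datastore.snowflake_utils), not from the constructor.

    from mlrun.datastore.targets import SnowflakeTarget

    target = SnowflakeTarget(
        name="snowflake",
        url="my_account.eu-west-1.snowflakecomputing.com",  # placeholder account
        user="ml_user",
        database="ANALYTICS",
        db_schema="PUBLIC",
        warehouse="COMPUTE_WH",
        table_name="FEATURES",
    )
    # source_spark_attributes is what a paired Snowflake source would read back
    print(target.source_spark_attributes)  # {..., "query": "SELECT * from FEATURES"}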
@@ -1156,6 +1330,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            **self.attributes,
+        )
+
+    def _get_table_and_columns(self, features, key_columns):
         key_columns = list(key_columns.keys())
         table = self._resource.uri
         column_list = self._get_column_list(
@@ -1174,15 +1361,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
                 col for col in column_list if col[0] not in aggregate_features
             ]
 
-        graph.add_step(
-            name=self.name or self.writer_step_name,
-            after=after,
-            graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
-            columns=column_list,
-            table=table,
-            **self.attributes,
-        )
+        return table, column_list
 
     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
@@ -1193,9 +1372,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1203,7 +1379,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1213,7 +1392,11 @@ class NoSqlBaseTarget(BaseStoreTarget):
             df = df.copy(deep=False)
             access_key = self._get_credential("V3IO_ACCESS_KEY")
 
-            _, path_with_container = parse_path(self.get_target_path())
+            store, path_in_store, target_path = self._get_store_and_path()
+            storage_options = store.get_storage_options()
+            access_key = storage_options.get("v3io_access_key", access_key)
+
+            _, path_with_container = parse_path(target_path)
             container, path = split_path(path_with_container)
 
             frames_client = get_frames_client(
@@ -1231,17 +1414,31 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver
 
-        # TODO use options/cred
-        endpoint, uri = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        endpoint, uri = parse_path(target_path)
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+
        return Table(
             uri,
-            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
+            V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key),
             flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        store_access_key = storage_options.get("v3io_access_key")
+        env_access_key = self._secrets.get(
+            "V3IO_ACCESS_KEY", os.getenv("V3IO_ACCESS_KEY")
+        )
+        if store_access_key and env_access_key and store_access_key != env_access_key:
+            logger.warning(
+                "The Spark v3io connector does not support access_key parameterization."
+                "Spark will disregard the store-provided key."
+            )
         spark_options = {
-            "path": store_path_to_spark(self.get_target_path()),
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1287,11 +1484,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     support_spark = True
     writer_step_name = "RedisNoSqlTarget"
 
-    # Fetch server url from the RedisNoSqlTarget::__init__() 'path' parameter.
-    # If not set fetch it from 'mlrun.mlconf.redis.url' (MLRUN_REDIS__URL environment variable).
-    # Then look for username and password at REDIS_xxx secrets
-    def _get_server_endpoint(self):
-        endpoint, uri = parse_path(self.get_target_path())
+    @staticmethod
+    def get_server_endpoint(path, credentials_prefix=None):
+        endpoint, uri = parse_path(path)
         endpoint = endpoint or mlrun.mlconf.redis.url
         if endpoint.startswith("ds://"):
             datastore_profile = datastore_profile_read(endpoint)
@@ -1308,8 +1503,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Provide Redis username and password only via secrets"
             )
-        user = self._get_credential("REDIS_USER", "")
-        password = self._get_credential("REDIS_PASSWORD", "")
+        credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
+            key="CREDENTIALS_PREFIX"
+        )
+        user = mlrun.get_secret_or_env(
+            "REDIS_USER", default="", prefix=credentials_prefix
+        )
+        password = mlrun.get_secret_or_env(
+            "REDIS_PASSWORD", default="", prefix=credentials_prefix
+        )
         host = parsed_endpoint.hostname
         port = parsed_endpoint.port if parsed_endpoint.port else "6379"
         scheme = parsed_endpoint.scheme
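Note (illustrative, values are placeholders): the endpoint lookup is now a static method, so the storey/Spark helpers can resolve it without holding a target instance. The address comes from the target path (or mlrun.mlconf.redis.url), while credentials are read only from REDIS_USER/REDIS_PASSWORD secrets or environment variables, optionally under a credentials prefix.

    import os
    from mlrun.datastore.targets import RedisNoSqlTarget

    os.environ["REDIS_USER"] = "default"           # placeholder; use project secrets in practice
    os.environ["REDIS_PASSWORD"] = "example-pass"  # placeholder
    endpoint, uri = RedisNoSqlTarget.get_server_endpoint(
        "redis://my-redis:6379/projects/demo/features"
    )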
@@ -1323,7 +1525,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         from storey import Table
         from storey.redis_driver import RedisDriver
 
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
 
         return Table(
             uri,
@@ -1332,12 +1536,14 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         )
 
     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
         parsed_endpoint = urlparse(endpoint)
-
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" + store_path_to_spark(self.get_target_path()),
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1364,6 +1570,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
 
         return df
 
+    def add_writer_step(
+        self,
+        graph,
+        after,
+        features,
+        key_columns=None,
+        timestamp_key=None,
+        featureset_status=None,
+    ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            path=self.get_target_path(),
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            credentials_prefix=self.credentials_prefix,
+            **self.attributes,
+        )
+

 class StreamTarget(BaseStoreTarget):
     kind = TargetTypes.stream
@@ -1382,33 +1611,46 @@ class StreamTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
-        from storey import V3ioDriver
-
         key_columns = list(key_columns.keys())
-        path = self.get_target_path()
-        if not path:
-            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
-        endpoint, uri = parse_path(path)
+
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
+        stream_path = self.get_target_path()
+        if not stream_path:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
 
         graph.add_step(
             name=self.name or "StreamTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.StreamTarget",
+            class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
             columns=column_list,
-            storage=V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
-            stream_path=uri,
+            stream_path=stream_path,
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
 
 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+    example::
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1421,11 +1663,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-        if bootstrap_servers is not None:
-            attrs["bootstrap_servers"] = bootstrap_servers
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options
 
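Note (adapted from the docstring added above, with placeholder broker addresses): brokers replaces the deprecated bootstrap_servers parameter.

    from mlrun.datastore.targets import KafkaTarget

    kafka_target = KafkaTarget(name="kafka", path="my_topic", brokers="localhost:9092")
    # Passing bootstrap_servers still works in 1.7.0 but emits a FutureWarning,
    # and combining it with brokers raises MLRunInvalidArgumentError.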
@@ -1444,37 +1702,21 @@
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        if self.path and self.path.startswith("ds://"):
-            datastore_profile = datastore_profile_read(self.path)
-            attributes = datastore_profile.attributes()
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic = datastore_profile.topic
-        else:
-            attributes = copy(self.attributes)
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic, bootstrap_servers = parse_kafka_url(
-                self.get_target_path(), bootstrap_servers
-            )
+        path = self.get_target_path()
 
-        if not topic:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "KafkaTarget requires a path (topic)"
-            )
+        if not path:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")
 
         graph.add_step(
             name=self.name or "KafkaTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.KafkaTarget",
+            class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
             columns=column_list,
-            topic=topic,
-            bootstrap_servers=bootstrap_servers,
-            **attributes,
+            path=path,
+            attributes=self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def purge(self):
         pass
 
@@ -1509,7 +1751,7 @@ class TSDBTarget(BaseStoreTarget):
 
         graph.add_step(
             name=self.name or "TSDBTarget",
-            class_name="storey.TSDBTarget",
+            class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
             after=after,
             graph_shape="cylinder",
             path=uri,
@@ -1519,9 +1761,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )
 
-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1535,7 +1774,11 @@ class TSDBTarget(BaseStoreTarget):
                 key_column = [key_column]
             new_index.extend(key_column)
 
-        _, path_with_container = parse_path(self.get_target_path())
+        store, path_in_store, target_path = self._get_store_and_path()
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key", access_key)
+
+        _, path_with_container = parse_path(target_path)
         container, path = split_path(path_with_container)
 
         frames_client = get_frames_client(
@@ -1555,6 +1798,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
@@ -1590,6 +1834,7 @@ class DFTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True
 
     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1626,11 +1871,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
@@ -1647,29 +1897,30 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True
 
     def __init__(
         self,
         name: str = "",
         path=None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, str] = None,
         after_step=None,
         partitioned: bool = False,
         key_bucketing_number: Optional[int] = None,
-        partition_cols: Optional[List[str]] = None,
+        partition_cols: Optional[list[str]] = None,
         time_partitioning_granularity: Optional[str] = None,
         max_events: Optional[int] = None,
         flush_after_seconds: Optional[int] = None,
-        storage_options: Dict[str, str] = None,
+        storage_options: dict[str, str] = None,
         db_url: str = None,
         table_name: str = None,
-        schema: Dict[str, Any] = None,
+        schema: dict[str, Any] = None,
         primary_key_column: str = "",
         if_exists: str = "append",
         create_table: bool = False,
         # create_according_to_data: bool = False,
         varchar_len: int = 50,
-        parse_dates: List[str] = None,
+        parse_dates: list[str] = None,
     ):
         """
         Write to SqlDB as output target for a flow.
@@ -1789,7 +2040,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1805,6 +2056,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         try:
@@ -1813,9 +2065,13 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
-        parse_dates: Optional[List[str]] = self.attributes.get("parse_dates")
+        parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
         with engine.connect() as conn:
             query, parse_dates = _generate_sql_query_with_time_filter(
                 table_name=table_name,
@@ -1902,7 +2158,7 @@ class SQLTarget(BaseStoreTarget):
                 raise ValueError(f"Table named {table_name} is not exist")
 
             elif not table_exists and create_table:
-                TYPE_TO_SQL_TYPE = {
+                type_to_sql_type = {
                     int: sqlalchemy.Integer,
                     str: sqlalchemy.String(self.attributes.get("varchar_len")),
                     datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1915,12 +2171,16 @@ class SQLTarget(BaseStoreTarget):
                 # creat new table with the given name
                 columns = []
                 for col, col_type in self.schema.items():
-                    col_type = TYPE_TO_SQL_TYPE.get(col_type)
-                    if col_type is None:
-                        raise TypeError(f"{col_type} unsupported type")
+                    col_type_sql = type_to_sql_type.get(col_type)
+                    if col_type_sql is None:
+                        raise TypeError(
+                            f"'{col_type}' unsupported type for column '{col}'"
+                        )
                     columns.append(
                         sqlalchemy.Column(
-                            col, col_type, primary_key=(col in primary_key_for_check)
+                            col,
+                            col_type_sql,
+                            primary_key=(col in primary_key_for_check),
                         )
                     )
 
@@ -1951,10 +2211,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }
 
 
-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1971,11 +2232,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
     )
     name = resource.metadata.name
     project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
        kind=kind,
        name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag
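Note (illustrative): how the new netloc/scheme arguments produce a default path when a target URL carries only a scheme and an endpoint. feature_set is an assumed, existing FeatureSet object, and the exact prefix comes from the configured template (get_default_prefix_for_target), so the result is indicative only.

    from mlrun.datastore.targets import RedisNoSqlTarget, _get_target_path

    def default_rediss_prefix(feature_set):
        driver = RedisNoSqlTarget(name="features", path="rediss://my-redis:6380")
        # per the hunk above, the template is filled with authority="my-redis:6380" and the
        # leading "redis://" of the default prefix is rewritten to "rediss://"
        return _get_target_path(
            driver, feature_set, run_id_mode=False, netloc="my-redis:6380", scheme="rediss"
        )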