mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
mlrun/datastore/targets.py

@@ -17,6 +17,7 @@ import os
 import random
 import sys
 import time
+import warnings
 from collections import Counter
 from copy import copy
 from typing import Any, Optional, Union
@@ -28,6 +29,11 @@ from mergedeep import merge
 import mlrun
 import mlrun.utils.helpers
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import (
+    get_snowflake_password,
+    get_snowflake_spark_options,
+)
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.model import DataSource, DataTarget, DataTargetBase, TargetPathObject
 from mlrun.utils import logger, now_date
 from mlrun.utils.helpers import to_parquet
@@ -41,7 +47,6 @@ from .spark_utils import spark_session_update_hadoop_options
 from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
-    parse_kafka_url,
     select_columns_from_df,
 )

@@ -57,6 +62,7 @@ class TargetTypes:
     dataframe = "dataframe"
     custom = "custom"
     sql = "sql"
+    snowflake = "snowflake"

     @staticmethod
     def all():
@@ -71,6 +77,7 @@ class TargetTypes:
             TargetTypes.dataframe,
             TargetTypes.custom,
             TargetTypes.sql,
+            TargetTypes.snowflake,
         ]


@@ -78,11 +85,14 @@ def generate_target_run_id():
     return f"{round(time.time() * 1000)}_{random.randint(0, 999)}"


-def write_spark_dataframe_with_options(spark_options, df, mode):
+def write_spark_dataframe_with_options(spark_options, df, mode, write_format=None):
     non_hadoop_spark_options = spark_session_update_hadoop_options(
         df.sql_ctx.sparkSession, spark_options
     )
-    df.write.mode(mode).save(**non_hadoop_spark_options)
+    if write_format:
+        df.write.format(write_format).mode(mode).save(**non_hadoop_spark_options)
+    else:
+        df.write.mode(mode).save(**non_hadoop_spark_options)


 def default_target_names():
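Note: the new write_format argument lets the writer route a connector/format name (for example the Snowflake or Redis Spark formats) through DataFrameWriter.format() instead of leaving it inside the options dict handed to save(). A rough standalone sketch of the same dispatch, assuming an existing Spark DataFrame df and illustrative option values:

    # route the connector/format name through .format(); remaining entries stay save() options
    options = {"path": "/tmp/out", "format": "csv"}  # illustrative values
    write_format = options.pop("format", None)
    writer = df.write.mode("overwrite")
    if write_format:
        writer = writer.format(write_format)
    writer.save(**options)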
@@ -379,6 +389,7 @@ class BaseStoreTarget(DataTargetBase):
     is_offline = False
     support_spark = False
     support_storey = False
+    support_pandas = False
     support_append = False

     def __init__(
@@ -428,6 +439,12 @@ class BaseStoreTarget(DataTargetBase):
         self.storage_options = storage_options
         self.schema = schema or {}
         self.credentials_prefix = credentials_prefix
+        if credentials_prefix:
+            warnings.warn(
+                "The 'credentials_prefix' parameter is deprecated and will be removed in "
+                "1.9.0. Please use datastore profiles instead.",
+                FutureWarning,
+            )

         self._target = None
         self._resource = None
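Note: the deprecation above points at datastore profiles as the replacement for credentials_prefix. A minimal sketch of that pattern, assuming mlrun's datastore-profile API (DatastoreProfileRedis, register_temporary_client_datastore_profile, and the field names are taken from the mlrun docs; adjust to your store type):

    from mlrun.datastore.datastore_profile import (
        DatastoreProfileRedis,
        register_temporary_client_datastore_profile,
    )
    from mlrun.datastore.targets import RedisNoSqlTarget

    # credentials live in the profile (or in project secrets), not behind a credentials_prefix
    profile = DatastoreProfileRedis(
        name="my-redis",
        endpoint_url="redis://my-redis.default.svc:6379",
        username="default",
        password="<redis-password>",
    )
    register_temporary_client_datastore_profile(profile)
    target = RedisNoSqlTarget(name="cache", path="ds://my-redis")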
@@ -451,7 +468,7 @@ class BaseStoreTarget(DataTargetBase):
             self.get_target_path(),
             credentials_prefix_secrets,
         )
-        return store, url
+        return store, resolved_store_path, url

     def _get_column_list(self, features, timestamp_key, key_columns, with_type=False):
         result = []
@@ -497,10 +514,13 @@ class BaseStoreTarget(DataTargetBase):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df, key_column, timestamp_key, options)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         elif hasattr(df, "dask"):
             dask_options = self.get_dask_options()
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             df = df.repartition(partition_size="100MB")
             try:
@@ -521,18 +541,21 @@ class BaseStoreTarget(DataTargetBase):
             except Exception as exc:
                 raise RuntimeError("Failed to write Dask Dataframe") from exc
         else:
-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             target_path = generate_path_with_chunk(self, chunk_id, target_path)
             file_system = store.filesystem
-            if file_system.protocol == "file":
+            if (
+                file_system.protocol == "file"
+                # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+                or isinstance(file_system.protocol, (tuple, list))
+                and "file" in file_system.protocol
+            ):
                 dir = os.path.dirname(target_path)
                 if dir:
                     os.makedirs(dir, exist_ok=True)
             target_df = df
             partition_cols = None  # single parquet file
-            if not target_path.endswith(".parquet") and not target_path.endswith(
-                ".pq"
-            ):  # directory
+            if not mlrun.utils.helpers.is_parquet_file(target_path):  # directory
                 partition_cols = []
                 if timestamp_key and (
                     self.partitioned or self.time_partitioning_granularity
@@ -641,6 +664,29 @@ class BaseStoreTarget(DataTargetBase):
     def _target_path_object(self):
         """return the actual/computed target path"""
         is_single_file = hasattr(self, "is_single_file") and self.is_single_file()
+
+        if self._resource and self.path:
+            parsed_url = urlparse(self.path)
+            # When the URL consists only from scheme and endpoint and no path,
+            # make a default path for DS and redis targets.
+            # Also ignore KafkaTarget when it uses the ds scheme (no default path for KafkaTarget)
+            if (
+                not isinstance(self, KafkaTarget)
+                and parsed_url.scheme in ["ds", "redis", "rediss"]
+                and (not parsed_url.path or parsed_url.path == "/")
+            ):
+                return TargetPathObject(
+                    _get_target_path(
+                        self,
+                        self._resource,
+                        self.run_id is not None,
+                        netloc=parsed_url.netloc,
+                        scheme=parsed_url.scheme,
+                    ),
+                    self.run_id,
+                    is_single_file,
+                )
+
         return self.get_path() or (
             TargetPathObject(
                 _get_target_path(self, self._resource, self.run_id is not None),
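Note: in practice the block above means a redis://, rediss://, or ds:// target URL that names only the server (no path component) now resolves to a generated default path instead of being used as-is. A hedged sketch (the generated layout comes from mlrun's default target-prefix templates, so the suffix described in the comment is illustrative only):

    import mlrun.feature_store as fstore
    from mlrun.datastore.targets import RedisNoSqlTarget

    fset = fstore.FeatureSet("stocks", entities=[fstore.Entity("ticker")])
    # scheme + endpoint only, no path component
    target = RedisNoSqlTarget(name="cache", path="redis://my-redis.default.svc:6379")
    fset.set_targets([target], with_defaults=False)
    # on ingestion, the resolved target path gains a default suffix derived from
    # the project, feature-set name, and target kind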
@@ -657,6 +703,7 @@ class BaseStoreTarget(DataTargetBase):
             self.kind, self.name, self.get_target_templated_path()
         )
         target = self._target
+        target.attributes = self.attributes
         target.run_id = self.run_id
         target.status = status or target.status or "created"
         target.updated = now_date().isoformat()
@@ -685,11 +732,25 @@ class BaseStoreTarget(DataTargetBase):
         timestamp_key=None,
         featureset_status=None,
     ):
+        if not self.support_storey:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support storey engine"
+            )
         raise NotImplementedError()

     def purge(self):
-        store, target_path = self._get_store_and_path()
-        store.rm(target_path, recursive=True)
+        """
+        Delete the files of the target.
+
+        Do not use this function directly from the sdk. Use FeatureSet.purge_targets.
+        """
+        store, path_in_store, target_path = self._get_store_and_path()
+        if path_in_store not in ["", "/"]:
+            store.rm(path_in_store, recursive=True)
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Unable to delete target. Please Use purge_targets from FeatureSet object."
+            )

     def as_df(
         self,
@@ -699,9 +760,15 @@ class BaseStoreTarget(DataTargetBase):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
+        if not self.support_pandas:
+            raise NotImplementedError()
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.get_dataitem(self.get_target_path()).as_df(
             columns=columns,
             df_module=df_module,
@@ -713,14 +780,22 @@ class BaseStoreTarget(DataTargetBase):

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
         # options used in spark.read.load(**options)
+        if not self.support_spark:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"{type(self).__name__} does not support spark engine"
+            )
         raise NotImplementedError()

-    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options={}):
+    def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         return df

     def get_dask_options(self):
         raise NotImplementedError()

+    @property
+    def source_spark_attributes(self) -> dict:
+        return {}
+

 class ParquetTarget(BaseStoreTarget):
     """Parquet target storage driver, used to materialize feature set/vector data into parquet files.
@@ -752,6 +827,7 @@ class ParquetTarget(BaseStoreTarget):
     support_spark = True
     support_storey = True
     support_dask = True
+    support_pandas = True
     support_append = True

     def __init__(
@@ -857,10 +933,9 @@ class ParquetTarget(BaseStoreTarget):
                 if time_unit == time_partitioning_granularity:
                     break

-        if (
-            not self.partitioned
-            and not self.get_target_path().endswith(".parquet")
-            and not self.get_target_path().endswith(".pq")
+        target_path = self.get_target_path()
+        if not self.partitioned and not mlrun.utils.helpers.is_parquet_file(
+            target_path
         ):
             partition_cols = []

@@ -868,25 +943,16 @@ class ParquetTarget(BaseStoreTarget):
         for key_column in key_columns:
             tuple_key_columns.append((key_column.name, key_column.value_type))

-        store, target_path = self._get_store_and_path()
-
-        storage_options = store.get_storage_options()
-        if storage_options and self.storage_options:
-            storage_options = merge(storage_options, self.storage_options)
-        else:
-            storage_options = storage_options or self.storage_options
-
         step = graph.add_step(
             name=self.name or "ParquetTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.ParquetTarget",
+            class_name="mlrun.datastore.storeytargets.ParquetStoreyTarget",
             path=target_path,
             columns=column_list,
             index_cols=tuple_key_columns,
             partition_cols=partition_cols,
             time_field=timestamp_key,
-            storage_options=storage_options,
             max_events=self.max_events,
             flush_after_seconds=self.flush_after_seconds,
             update_last_written=featureset_status.update_last_written_for_target,
@@ -921,9 +987,7 @@ class ParquetTarget(BaseStoreTarget):
                 if unit == time_partitioning_granularity:
                     break

-        store, path, url = mlrun.store_manager.get_or_create_store(
-            self.get_target_path()
-        )
+        store, path, url = self._get_store_and_path()
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -948,6 +1012,7 @@ class ParquetTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return the target data as dataframe"""
@@ -958,6 +1023,7 @@ class ParquetTarget(BaseStoreTarget):
             start_time=start_time,
             end_time=end_time,
             time_column=time_column,
+            additional_filters=transform_list_filters_to_tuple(additional_filters),
             **kwargs,
         )
         if not columns:
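Note: additional_filters is forwarded to the parquet reader as row-group filters, with transform_list_filters_to_tuple normalizing list-style filters into tuples. A hedged usage sketch, assuming an already-materialized parquet target and pyarrow/pandas-style filter triples:

    from mlrun.datastore.targets import ParquetTarget

    target = ParquetTarget(name="pq", path="v3io:///projects/demo/stocks.parquet")
    df = target.as_df(
        columns=["ticker", "price"],
        additional_filters=[("price", ">", 100)],  # lists are normalized to tuples internally
    )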
@@ -979,9 +1045,7 @@ class ParquetTarget(BaseStoreTarget):
         return result

     def is_single_file(self):
-        if self.path:
-            return self.path.endswith(".parquet") or self.path.endswith(".pq")
-        return False
+        return mlrun.utils.helpers.is_parquet_file(self.path)

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         # If partitioning by time, add the necessary columns
@@ -1021,6 +1085,7 @@ class CSVTarget(BaseStoreTarget):
     is_offline = True
     support_spark = True
     support_storey = True
+    support_pandas = True

     @staticmethod
     def _write_dataframe(df, storage_options, target_path, partition_cols, **kwargs):
@@ -1042,24 +1107,21 @@ class CSVTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        store, target_path = self._get_store_and_path()
+        target_path = self.get_target_path()
         graph.add_step(
             name=self.name or "CSVTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.CSVTarget",
+            class_name="mlrun.datastore.storeytargets.CSVStoreyTarget",
             path=target_path,
             columns=column_list,
             header=True,
             index_cols=key_columns,
-            storage_options=store.get_storage_options(),
             **self.attributes,
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        store, path, url = mlrun.store_manager.get_or_create_store(
-            self.get_target_path()
-        )
+        store, path, url = self._get_store_and_path()
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -1074,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
         import pyspark.sql.functions as funcs

         for col_name, col_type in df.dtypes:
-            if col_type == "timestamp":
+            # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
+            if col_type.startswith("timestamp"):
                 # df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
                 # for compatibility with storey.
                 df = df.withColumn(
@@ -1090,8 +1153,12 @@ class CSVTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         df = super().as_df(
             columns=columns,
             df_module=df_module,
@@ -1112,6 +1179,134 @@ class CSVTarget(BaseStoreTarget):
         return True


+class SnowflakeTarget(BaseStoreTarget):
+    """
+    :param attributes: A dictionary of attributes for Snowflake connection; will be overridden by database parameters
+        if they exist.
+    :param url: Snowflake hostname, in the format: <account_name>.<region>.snowflakecomputing.com
+    :param user: Snowflake user for login
+    :param db_schema: Database schema
+    :param database: Database name
+    :param warehouse: Snowflake warehouse name
+    :param table_name: Snowflake table name
+    """
+
+    support_spark = True
+    support_append = True
+    is_offline = True
+    kind = TargetTypes.snowflake
+
+    def __init__(
+        self,
+        name: str = "",
+        path=None,
+        attributes: dict[str, str] = None,
+        after_step=None,
+        columns=None,
+        partitioned: bool = False,
+        key_bucketing_number: Optional[int] = None,
+        partition_cols: Optional[list[str]] = None,
+        time_partitioning_granularity: Optional[str] = None,
+        max_events: Optional[int] = None,
+        flush_after_seconds: Optional[int] = None,
+        storage_options: dict[str, str] = None,
+        schema: dict[str, Any] = None,
+        credentials_prefix=None,
+        url: str = None,
+        user: str = None,
+        db_schema: str = None,
+        database: str = None,
+        warehouse: str = None,
+        table_name: str = None,
+    ):
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if table_name:
+            attributes["table"] = table_name
+
+        super().__init__(
+            name,
+            path,
+            attributes,
+            after_step,
+            list(schema.keys()) if schema else columns,
+            partitioned,
+            key_bucketing_number,
+            partition_cols,
+            time_partitioning_granularity,
+            max_events=max_events,
+            flush_after_seconds=flush_after_seconds,
+            storage_options=storage_options,
+            schema=schema,
+            credentials_prefix=credentials_prefix,
+        )
+
+    def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["dbtable"] = self.attributes.get("table")
+        return spark_options
+
+    def purge(self):
+        import snowflake.connector
+
+        missing = [
+            key
+            for key in ["database", "db_schema", "table", "url", "user", "warehouse"]
+            if self.attributes.get(key) is None
+        ]
+        if missing:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Can't purge Snowflake target, "
+                f"some attributes are missing: {', '.join(missing)}"
+            )
+        account = self.attributes["url"].replace(".snowflakecomputing.com", "")
+
+        with snowflake.connector.connect(
+            account=account,
+            user=self.attributes["user"],
+            password=get_snowflake_password(),
+            warehouse=self.attributes["warehouse"],
+        ) as snowflake_connector:
+            drop_statement = (
+                f"DROP TABLE IF EXISTS {self.attributes['database']}.{self.attributes['db_schema']}"
+                f".{self.attributes['table']}"
+            )
+            snowflake_connector.execute_string(drop_statement)

+    def as_df(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_column=None,
+        additional_filters=None,
+        **kwargs,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} does not support pandas engine"
+        )
+
+    @property
+    def source_spark_attributes(self) -> dict:
+        keys = ["url", "user", "database", "db_schema", "warehouse"]
+        attributes = self.attributes or {}
+        snowflake_dict = {key: attributes.get(key) for key in keys}
+        table = attributes.get("table")
+        snowflake_dict["query"] = f"SELECT * from {table}" if table else None
+        return snowflake_dict
+
+
 class NoSqlBaseTarget(BaseStoreTarget):
     is_table = True
     is_online = True
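Note: a hedged construction sketch for the new SnowflakeTarget (Spark engine only; as_df above rejects the pandas engine). The password is not passed in the constructor: get_snowflake_password() is expected to resolve it from project secrets or the environment (SNOWFLAKE_PASSWORD is an assumption based on the helper's name), and the target is then attached to a feature set before a Spark-engine ingest:

    from mlrun.datastore.targets import SnowflakeTarget

    target = SnowflakeTarget(
        name="snowflake",
        url="<account>.<region>.snowflakecomputing.com",
        user="MLRUN_USER",
        database="ANALYTICS",
        db_schema="PUBLIC",
        warehouse="COMPUTE_WH",
        table_name="STOCKS_FEATURES",
    )
    # e.g. fset.set_targets([target], with_defaults=False) before a spark-engine ingest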
@@ -1136,6 +1331,19 @@ class NoSqlBaseTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            **self.attributes,
+        )
+
+    def _get_table_and_columns(self, features, key_columns):
         key_columns = list(key_columns.keys())
         table = self._resource.uri
         column_list = self._get_column_list(
@@ -1154,15 +1362,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             col for col in column_list if col[0] not in aggregate_features
         ]

-        graph.add_step(
-            name=self.name or self.writer_step_name,
-            after=after,
-            graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
-            columns=column_list,
-            table=table,
-            **self.attributes,
-        )
+        return table, column_list

     def prepare_spark_df(self, df, key_columns, timestamp_key=None, spark_options=None):
         raise NotImplementedError()
@@ -1173,9 +1373,6 @@ class NoSqlBaseTarget(BaseStoreTarget):
     def get_dask_options(self):
         return {"format": "csv"}

-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1183,7 +1380,10 @@ class NoSqlBaseTarget(BaseStoreTarget):
             options = self.get_spark_options(key_column, timestamp_key)
             options.update(kwargs)
             df = self.prepare_spark_df(df)
-            write_spark_dataframe_with_options(options, df, "overwrite")
+            write_format = options.pop("format", None)
+            write_spark_dataframe_with_options(
+                options, df, "overwrite", write_format=write_format
+            )
         else:
             # To prevent modification of the original dataframe and make sure
             # that the last event of a key is the one being persisted
@@ -1193,7 +1393,7 @@ class NoSqlBaseTarget(BaseStoreTarget):
             df = df.copy(deep=False)
             access_key = self._get_credential("V3IO_ACCESS_KEY")

-            store, target_path = self._get_store_and_path()
+            store, path_in_store, target_path = self._get_store_and_path()
             storage_options = store.get_storage_options()
             access_key = storage_options.get("v3io_access_key", access_key)

@@ -1215,7 +1415,7 @@ class NoSqlTarget(NoSqlBaseTarget):
     def get_table_object(self):
         from storey import Table, V3ioDriver

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         endpoint, uri = parse_path(target_path)
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key")
@@ -1227,7 +1427,7 @@ class NoSqlTarget(NoSqlBaseTarget):
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         storage_options = store.get_storage_options()
         store_access_key = storage_options.get("v3io_access_key")
         env_access_key = self._secrets.get(
@@ -1239,7 +1439,7 @@ class NoSqlTarget(NoSqlBaseTarget):
                 "Spark will disregard the store-provided key."
             )
         spark_options = {
-            "path": store.spark_url + target_path,
+            "path": store.spark_url + path_in_store,
             "format": "io.iguaz.v3io.spark.sql.kv",
         }
         if isinstance(key_column, list) and len(key_column) >= 1:
@@ -1285,11 +1485,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
     support_spark = True
     writer_step_name = "RedisNoSqlTarget"

-    # Fetch server url from the RedisNoSqlTarget::__init__() 'path' parameter.
-    # If not set fetch it from 'mlrun.mlconf.redis.url' (MLRUN_REDIS__URL environment variable).
-    # Then look for username and password at REDIS_xxx secrets
-    def _get_server_endpoint(self):
-        endpoint, uri = parse_path(self.get_target_path())
+    @staticmethod
+    def get_server_endpoint(path, credentials_prefix=None):
+        endpoint, uri = parse_path(path)
         endpoint = endpoint or mlrun.mlconf.redis.url
         if endpoint.startswith("ds://"):
             datastore_profile = datastore_profile_read(endpoint)
@@ -1306,8 +1504,15 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
                 raise mlrun.errors.MLRunInvalidArgumentError(
                     "Provide Redis username and password only via secrets"
                 )
-        user = self._get_credential("REDIS_USER", "")
-        password = self._get_credential("REDIS_PASSWORD", "")
+        credentials_prefix = credentials_prefix or mlrun.get_secret_or_env(
+            key="CREDENTIALS_PREFIX"
+        )
+        user = mlrun.get_secret_or_env(
+            "REDIS_USER", default="", prefix=credentials_prefix
+        )
+        password = mlrun.get_secret_or_env(
+            "REDIS_PASSWORD", default="", prefix=credentials_prefix
+        )
         host = parsed_endpoint.hostname
         port = parsed_endpoint.port if parsed_endpoint.port else "6379"
         scheme = parsed_endpoint.scheme
@@ -1321,7 +1526,9 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         from storey import Table
         from storey.redis_driver import RedisDriver

-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )

         return Table(
             uri,
@@ -1330,12 +1537,14 @@ class RedisNoSqlTarget(NoSqlBaseTarget):
         )

     def get_spark_options(self, key_column=None, timestamp_key=None, overwrite=True):
-        endpoint, uri = self._get_server_endpoint()
+        endpoint, uri = self.get_server_endpoint(
+            self.get_target_path(), self.credentials_prefix
+        )
         parsed_endpoint = urlparse(endpoint)
-        store, path = self._get_store_and_path()
+        store, path_in_store, path = self._get_store_and_path()
         return {
             "key.column": "_spark_object_name",
-            "table": "{" + store.spark_url + path,
+            "table": "{" + path_in_store,
             "format": "org.apache.spark.sql.redis",
             "host": parsed_endpoint.hostname,
             "port": parsed_endpoint.port,
@@ -1362,6 +1571,29 @@ class RedisNoSqlTarget(NoSqlBaseTarget):

         return df

+    def add_writer_step(
+        self,
+        graph,
+        after,
+        features,
+        key_columns=None,
+        timestamp_key=None,
+        featureset_status=None,
+    ):
+        table, column_list = self._get_table_and_columns(features, key_columns)
+
+        graph.add_step(
+            path=self.get_target_path(),
+            name=self.name or self.writer_step_name,
+            after=after,
+            graph_shape="cylinder",
+            class_name="mlrun.datastore.storeytargets.RedisNoSqlStoreyTarget",
+            columns=column_list,
+            table=table,
+            credentials_prefix=self.credentials_prefix,
+            **self.attributes,
+        )
+

 class StreamTarget(BaseStoreTarget):
     kind = TargetTypes.stream
@@ -1380,37 +1612,46 @@ class StreamTarget(BaseStoreTarget):
         timestamp_key=None,
         featureset_status=None,
     ):
-        from storey import V3ioDriver
-
         key_columns = list(key_columns.keys())
-        store, path = self._get_store_and_path()
-        if not path:
-            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")
-        endpoint, uri = parse_path(path)
-        storage_options = store.get_storage_options()
-        access_key = storage_options.get("v3io_access_key")
+
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
+        stream_path = self.get_target_path()
+        if not stream_path:
+            raise mlrun.errors.MLRunInvalidArgumentError("StreamTarget requires a path")

         graph.add_step(
             name=self.name or "StreamTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.StreamTarget",
+            class_name="mlrun.datastore.storeytargets.StreamStoreyTarget",
             columns=column_list,
-            storage=V3ioDriver(
-                webapi=endpoint or mlrun.mlconf.v3io_api, access_key=access_key
-            ),
-            stream_path=uri,
+            stream_path=stream_path,
             **self.attributes,
         )

-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-

 class KafkaTarget(BaseStoreTarget):
+    """
+    Kafka target storage driver, used to write data into kafka topics.
+    example::
+        # define target
+        kafka_target = KafkaTarget(
+            name="kafka", path="my_topic", brokers="localhost:9092"
+        )
+        # ingest
+        stocks_set.ingest(stocks, [kafka_target])
+    :param name: target name
+    :param path: topic name e.g. "my_topic"
+    :param after_step: optional, after what step in the graph to add the target
+    :param columns: optional, which columns from data to write
+    :param bootstrap_servers: Deprecated. Use the brokers parameter instead
+    :param producer_options: additional configurations for kafka producer
+    :param brokers: kafka broker as represented by a host:port pair, or a list of kafka brokers, e.g.
+        "localhost:9092", or ["kafka-broker-1:9092", "kafka-broker-2:9092"]
+    """
+
     kind = TargetTypes.kafka
     is_table = False
     is_online = False
@@ -1423,11 +1664,27 @@ class KafkaTarget(BaseStoreTarget):
         *args,
         bootstrap_servers=None,
         producer_options=None,
+        brokers=None,
         **kwargs,
     ):
         attrs = {}
-        if bootstrap_servers is not None:
-            attrs["bootstrap_servers"] = bootstrap_servers
+
+        # TODO: Remove this in 1.9.0
+        if bootstrap_servers:
+            if brokers:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "KafkaTarget cannot be created with both the 'brokers' parameter and the deprecated "
+                    "'bootstrap_servers' parameter. Please use 'brokers' only."
+                )
+            warnings.warn(
+                "'bootstrap_servers' parameter is deprecated in 1.7.0 and will be removed in 1.9.0, "
+                "use 'brokers' instead.",
+                FutureWarning,
+            )
+            brokers = bootstrap_servers
+
+        if brokers:
+            attrs["brokers"] = brokers
         if producer_options is not None:
             attrs["producer_options"] = producer_options

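Note: migrating off the deprecated bootstrap_servers parameter is a rename; the same value (a host:port string or a list of brokers) is passed through brokers, e.g.:

    # before (now emits a FutureWarning; removed in 1.9.0)
    target = KafkaTarget(name="kafka", path="my_topic", bootstrap_servers="localhost:9092")

    # after
    target = KafkaTarget(name="kafka", path="my_topic", brokers=["kafka-1:9092", "kafka-2:9092"])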
@@ -1446,37 +1703,21 @@ class KafkaTarget(BaseStoreTarget):
         column_list = self._get_column_list(
             features=features, timestamp_key=timestamp_key, key_columns=key_columns
         )
-        if self.path and self.path.startswith("ds://"):
-            datastore_profile = datastore_profile_read(self.path)
-            attributes = datastore_profile.attributes()
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic = datastore_profile.topic
-        else:
-            attributes = copy(self.attributes)
-            bootstrap_servers = attributes.pop("bootstrap_servers", None)
-            topic, bootstrap_servers = parse_kafka_url(
-                self.get_target_path(), bootstrap_servers
-            )
+        path = self.get_target_path()

-        if not topic:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "KafkaTarget requires a path (topic)"
-            )
+        if not path:
+            raise mlrun.errors.MLRunInvalidArgumentError("KafkaTarget requires a path")

         graph.add_step(
             name=self.name or "KafkaTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.KafkaTarget",
+            class_name="mlrun.datastore.storeytargets.KafkaStoreyTarget",
             columns=column_list,
-            topic=topic,
-            bootstrap_servers=bootstrap_servers,
-            **attributes,
+            path=path,
+            attributes=self.attributes,
         )

-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def purge(self):
         pass

@@ -1511,7 +1752,7 @@ class TSDBTarget(BaseStoreTarget):

         graph.add_step(
             name=self.name or "TSDBTarget",
-            class_name="storey.TSDBTarget",
+            class_name="mlrun.datastore.storeytargets.TSDBStoreyTarget",
             after=after,
             graph_shape="cylinder",
             path=uri,
@@ -1521,9 +1762,6 @@ class TSDBTarget(BaseStoreTarget):
             **self.attributes,
         )

-    def as_df(self, columns=None, df_module=None, **kwargs):
-        raise NotImplementedError()
-
     def write_dataframe(
         self, df, key_column=None, timestamp_key=None, chunk_id=0, **kwargs
     ):
@@ -1537,7 +1775,7 @@ class TSDBTarget(BaseStoreTarget):
                 key_column = [key_column]
             new_index.extend(key_column)

-        store, target_path = self._get_store_and_path()
+        store, path_in_store, target_path = self._get_store_and_path()
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key", access_key)

@@ -1561,6 +1799,7 @@ class CustomTarget(BaseStoreTarget):
     is_online = False
     support_spark = False
     support_storey = True
+    support_pandas = True

     def __init__(
         self,
@@ -1596,6 +1835,7 @@ class CustomTarget(BaseStoreTarget):
 class DFTarget(BaseStoreTarget):
     kind = TargetTypes.dataframe
     support_storey = True
+    support_pandas = True

     def __init__(self, *args, name="dataframe", **kwargs):
         self._df = None
@@ -1632,11 +1872,16 @@ class DFTarget(BaseStoreTarget):
         self,
         columns=None,
         df_module=None,
+        entities=None,
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return select_columns_from_df(
             filter_df_start_end_time(
                 self._df,
@@ -1653,6 +1898,7 @@ class SQLTarget(BaseStoreTarget):
     is_online = True
     support_spark = False
     support_storey = True
+    support_pandas = True

     def __init__(
         self,
@@ -1795,7 +2041,7 @@ class SQLTarget(BaseStoreTarget):
             name=self.name or "SqlTarget",
             after=after,
             graph_shape="cylinder",
-            class_name="storey.NoSqlTarget",
+            class_name="mlrun.datastore.storeytargets.NoSqlStoreyTarget",
             columns=column_list,
             header=True,
             table=table,
@@ -1811,6 +2057,7 @@ class SQLTarget(BaseStoreTarget):
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         try:
@@ -1819,6 +2066,10 @@ class SQLTarget(BaseStoreTarget):
         except (ModuleNotFoundError, ImportError) as exc:
             self._raise_sqlalchemy_import_error(exc)

+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         db_path, table_name, _, _, _, _ = self._parse_url()
         engine = sqlalchemy.create_engine(db_path)
         parse_dates: Optional[list[str]] = self.attributes.get("parse_dates")
@@ -1908,7 +2159,7 @@ class SQLTarget(BaseStoreTarget):
                 raise ValueError(f"Table named {table_name} is not exist")

             elif not table_exists and create_table:
-                TYPE_TO_SQL_TYPE = {
+                type_to_sql_type = {
                     int: sqlalchemy.Integer,
                     str: sqlalchemy.String(self.attributes.get("varchar_len")),
                     datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -1921,7 +2172,7 @@ class SQLTarget(BaseStoreTarget):
                 # creat new table with the given name
                 columns = []
                 for col, col_type in self.schema.items():
-                    col_type_sql = TYPE_TO_SQL_TYPE.get(col_type)
+                    col_type_sql = type_to_sql_type.get(col_type)
                     if col_type_sql is None:
                         raise TypeError(
                             f"'{col_type}' unsupported type for column '{col}'"
@@ -1961,10 +2212,11 @@ kind_to_driver = {
     TargetTypes.tsdb: TSDBTarget,
     TargetTypes.custom: CustomTarget,
     TargetTypes.sql: SQLTarget,
+    TargetTypes.snowflake: SnowflakeTarget,
 }


-def _get_target_path(driver, resource, run_id_mode=False):
+def _get_target_path(driver, resource, run_id_mode=False, netloc=None, scheme=""):
     """return the default target path given the resource and target kind"""
     kind = driver.kind
     suffix = driver.suffix
@@ -1981,11 +2233,27 @@ def _get_target_path(driver, resource, run_id_mode=False):
         )
     name = resource.metadata.name
     project = resource.metadata.project or mlrun.mlconf.default_project
-    data_prefix = get_default_prefix_for_target(kind).format(
+
+    default_kind_name = kind
+    if scheme == "ds":
+        # "dsnosql" is not an actual target like Parquet or Redis; rather, it serves
+        # as a placeholder that can be used in any specified target
+        default_kind_name = "dsnosql"
+    if scheme == "redis" or scheme == "rediss":
+        default_kind_name = TargetTypes.redisnosql
+
+    netloc = netloc or ""
+    data_prefix = get_default_prefix_for_target(default_kind_name).format(
+        ds_profile_name=netloc,  # In case of ds profile, set its the name
+        authority=netloc,  # In case of redis, replace {authority} with netloc
         project=project,
        kind=kind,
         name=name,
     )
+
+    if scheme == "rediss":
+        data_prefix = data_prefix.replace("redis://", "rediss://", 1)
+
     # todo: handle ver tag changes, may need to copy files?
     if not run_id_mode:
         version = resource.metadata.tag