mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (305)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +134 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +133 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc8.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc8.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
--- a/mlrun/datastore/sources.py
+++ b/mlrun/datastore/sources.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
 from copy import copy
 from datetime import datetime
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
 import semver
@@ -27,9 +28,12 @@ from nuclio import KafkaTrigger
 from nuclio.config import split_path
 
 import mlrun
+from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger
 
-from ..config import config
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
 from ..utils import get_class, is_explicit_ack_supported
@@ -39,7 +43,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )
 
 
@@ -83,7 +86,8 @@ class BaseSourceDriver(DataSource):
         )
 
         explicit_ack = (
-            is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         return storey.SyncEmitSource(
             context=context,
@@ -102,8 +106,12 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -114,7 +122,11 @@
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-            df = load_spark_dataframe_with_options(session, self.get_spark_options())
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
@@ -170,10 +182,10 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
-        parse_dates: Union[None, int, str, List[int], List[str]] = None,
+        parse_dates: Union[None, int, str, list[int], list[str]] = None,
         **kwargs,
     ):
         super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
@@ -193,14 +205,10 @@ class CSVSource(BaseSourceDriver):
             parse_dates.append(time_field)
 
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
 
         return storey.CSVSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +217,17 @@
         )
 
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -253,7 +253,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -289,6 +293,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -299,13 +309,19 @@
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -333,6 +349,10 @@
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
    @staticmethod
    def _convert_to_datetime(time):
        if time and isinstance(time, str):
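
The new additional_filters parameter is client-side API, so a minimal usage sketch (the parquet path and column names below are illustrative, not taken from this diff) looks like:

    from mlrun.datastore.sources import ParquetSource

    # column names mirror the docstring example; the path is hypothetical
    source = ParquetSource(
        "sales",
        path="s3://my-bucket/sales.parquet",
        additional_filters=[("Product", "=", "Computer"), ("Price", ">", 100)],
    )
    # the filters are normalized to tuples and stored under attributes["additional_filters"],
    # where both the storey step and the pandas/pyarrow reader pick them up
    print(source.additional_filters)

Sources that cannot push filters down (CSVSource, BigQuerySource, SQLSource, DataFrameSource) accept the argument but only emit a warning through additional_filters_warning.
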
@@ -349,45 +369,48 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes or {}
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
         return storey.ParquetSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options
 
     def to_dataframe(
         self,
@@ -397,8 +420,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -406,9 +431,88 @@
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                else:
+                    if op.lower() == "in":
+                        new_filter = col(col_name).isin(value)
+                    elif op.lower() == "not in":
+                        new_filter = ~col(col_name).isin(value)
+            elif op in operators:
+                new_filter = operators[op](col(col_name), value)
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"unsupported filter operator: {op}"
+                )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
@@ -423,12 +527,17 @@ class BigQuerySource(BaseSourceDriver):
 
         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource("bq1", query=query_string,
-                                gcp_project="my_project",
-                                materialization_dataset="dataviews")
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )
 
         # read a table
-        source = BigQuerySource("bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project")
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )
 
 
     :parameter name: source name
@@ -531,10 +640,15 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -574,7 +688,6 @@
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -636,7 +749,7 @@ class SnowflakeSource(BaseSourceDriver):
             url="...",
             user="...",
             database="...",
-            schema="...",
+            db_schema="...",
             warehouse="...",
         )
 
@@ -651,7 +764,8 @@
     :parameter url: URL of the snowflake cluster
     :parameter user: snowflake user
     :parameter database: snowflake database
-    :parameter schema: snowflake schema
+    :parameter schema: snowflake schema - deprecated, use db_schema
+    :parameter db_schema: snowflake schema
     :parameter warehouse: snowflake warehouse
     """
 
@@ -663,6 +777,7 @@
         self,
         name: str = "",
         key_field: str = None,
+        attributes: dict[str, object] = None,
         time_field: str = None,
         schedule: str = None,
         start_time=None,
@@ -672,21 +787,34 @@
         user: str = None,
         database: str = None,
         schema: str = None,
+        db_schema: str = None,
         warehouse: str = None,
         **kwargs,
     ):
-        attrs = {
-            "query": query,
-            "url": url,
-            "user": user,
-            "database": database,
-            "schema": schema,
-            "warehouse": warehouse,
-        }
+        # TODO: Remove in 1.9.0
+        if schema:
+            warnings.warn(
+                "schema is deprecated in 1.7.0, and will be removed in 1.9.0, please use db_schema"
+            )
+        db_schema = db_schema or schema  # TODO: Remove in 1.9.0
+
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if query:
+            attributes["query"] = query
 
         super().__init__(
             name,
-            attributes=attrs,
+            attributes=attributes,
             key_field=key_field,
             time_field=time_field,
             schedule=schedule,
@@ -695,32 +823,24 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )
 
-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-        return {
-            "format": "net.snowflake.spark.snowflake",
-            "query": self.attributes.get("query"),
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
+
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )
 
 
 class CustomSource(BaseSourceDriver):
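
A short sketch of the reworked SnowflakeSource construction (connection values are placeholders): the old schema keyword still works but now emits a deprecation warning, and db_schema is the replacement:

    from mlrun.datastore.sources import SnowflakeSource

    source = SnowflakeSource(
        "sf",
        query="SELECT * FROM MY_TABLE LIMIT 100",
        url="<account>.snowflakecomputing.com",
        user="<user>",
        database="MY_DB",
        db_schema="PUBLIC",  # replaces the deprecated `schema` argument
        warehouse="MY_WH",
    )

Note that the source is Spark-only in 1.7.0: to_dataframe() raises MLRunRuntimeError, and get_spark_options() now delegates the connection options (including password handling) to get_snowflake_spark_options.
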
@@ -774,7 +894,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(self, **kwargs):
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -800,7 +932,7 @@ class OnlineSource(BaseSourceDriver):
         self,
         name: str = None,
         path: str = None,
-        attributes: Dict[str, object] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         workers: int = None,
@@ -812,16 +944,13 @@
     def to_step(self, key_field=None, time_field=None, context=None):
         import storey
 
-        source_class = (
-            storey.AsyncEmitSource
-            if config.datastore.async_source_mode == "enabled"
-            else storey.SyncEmitSource
-        )
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
-            is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
-        src_class = source_class(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -848,8 +977,6 @@ class HttpSource(OnlineSource):
 
 
 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"
 
     def __init__(
@@ -863,7 +990,7 @@
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.
 
         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -882,8 +1009,15 @@
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        endpoint, stream_path = parse_path(self.path)
-        v3io_client = v3io.dataplane.Client(endpoint=endpoint)
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -898,12 +1032,13 @@
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-        if mlrun.mlconf.is_explicit_ack() and engine == "async":
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             kwargs["explicit_ack_mode"] = "explicitOnly"
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-            self.path,
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -915,8 +1050,6 @@
 
 
 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"
 
     def __init__(
@@ -970,6 +1103,7 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -986,7 +1120,8 @@
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-        if mlrun.mlconf.is_explicit_ack() and engine == "async":
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             explicit_ack_mode = "explicitOnly"
             extra_attributes["workerAllocationMode"] = extra_attributes.get(
                 "worker_allocation_mode", "static"
@@ -1029,6 +1164,59 @@
             "to a Spark dataframe is not possible, as this operation is not supported by Spark"
         )
 
+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions: number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics: list of topic names to create, if None,
+            the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+
 
 class SQLSource(BaseSourceDriver):
     kind = "sqldb"
@@ -1047,7 +1235,7 @@ class SQLSource(BaseSourceDriver):
         db_url: str = None,
         table_name: str = None,
         spark_options: dict = None,
-        parse_dates: List[str] = None,
+        parse_dates: list[str] = None,
         **kwargs,
     ):
         """
@@ -1110,9 +1298,13 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")