mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (235)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +39 -121
  3. mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +39 -254
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +73 -46
  10. mlrun/artifacts/model.py +30 -158
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +73 -1
  13. mlrun/common/db/sql_session.py +3 -2
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +46 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +44 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +11 -1
  23. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  24. mlrun/common/schemas/__init__.py +31 -4
  25. mlrun/common/schemas/alert.py +202 -0
  26. mlrun/common/schemas/api_gateway.py +196 -0
  27. mlrun/common/schemas/artifact.py +28 -1
  28. mlrun/common/schemas/auth.py +13 -2
  29. mlrun/common/schemas/client_spec.py +2 -1
  30. mlrun/common/schemas/common.py +7 -4
  31. mlrun/common/schemas/constants.py +3 -0
  32. mlrun/common/schemas/feature_store.py +58 -28
  33. mlrun/common/schemas/frontend_spec.py +8 -0
  34. mlrun/common/schemas/function.py +11 -0
  35. mlrun/common/schemas/hub.py +7 -9
  36. mlrun/common/schemas/model_monitoring/__init__.py +21 -4
  37. mlrun/common/schemas/model_monitoring/constants.py +136 -42
  38. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  39. mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
  40. mlrun/common/schemas/notification.py +69 -12
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +7 -0
  43. mlrun/common/schemas/project.py +67 -16
  44. mlrun/common/schemas/runs.py +17 -0
  45. mlrun/common/schemas/schedule.py +1 -1
  46. mlrun/common/schemas/workflow.py +10 -2
  47. mlrun/common/types.py +14 -1
  48. mlrun/config.py +233 -58
  49. mlrun/data_types/data_types.py +11 -1
  50. mlrun/data_types/spark.py +5 -4
  51. mlrun/data_types/to_pandas.py +75 -34
  52. mlrun/datastore/__init__.py +8 -10
  53. mlrun/datastore/alibaba_oss.py +131 -0
  54. mlrun/datastore/azure_blob.py +131 -43
  55. mlrun/datastore/base.py +107 -47
  56. mlrun/datastore/datastore.py +17 -7
  57. mlrun/datastore/datastore_profile.py +91 -7
  58. mlrun/datastore/dbfs_store.py +3 -7
  59. mlrun/datastore/filestore.py +1 -3
  60. mlrun/datastore/google_cloud_storage.py +92 -32
  61. mlrun/datastore/hdfs.py +5 -0
  62. mlrun/datastore/inmem.py +6 -3
  63. mlrun/datastore/redis.py +3 -2
  64. mlrun/datastore/s3.py +30 -12
  65. mlrun/datastore/snowflake_utils.py +45 -0
  66. mlrun/datastore/sources.py +274 -59
  67. mlrun/datastore/spark_utils.py +30 -0
  68. mlrun/datastore/store_resources.py +9 -7
  69. mlrun/datastore/storeytargets.py +151 -0
  70. mlrun/datastore/targets.py +387 -119
  71. mlrun/datastore/utils.py +68 -5
  72. mlrun/datastore/v3io.py +28 -50
  73. mlrun/db/auth_utils.py +152 -0
  74. mlrun/db/base.py +245 -20
  75. mlrun/db/factory.py +1 -4
  76. mlrun/db/httpdb.py +909 -231
  77. mlrun/db/nopdb.py +279 -14
  78. mlrun/errors.py +35 -5
  79. mlrun/execution.py +111 -38
  80. mlrun/feature_store/__init__.py +0 -2
  81. mlrun/feature_store/api.py +46 -53
  82. mlrun/feature_store/common.py +6 -11
  83. mlrun/feature_store/feature_set.py +48 -23
  84. mlrun/feature_store/feature_vector.py +13 -2
  85. mlrun/feature_store/ingestion.py +7 -6
  86. mlrun/feature_store/retrieval/base.py +9 -4
  87. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  88. mlrun/feature_store/retrieval/job.py +13 -4
  89. mlrun/feature_store/retrieval/local_merger.py +2 -0
  90. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  91. mlrun/feature_store/steps.py +38 -19
  92. mlrun/features.py +6 -14
  93. mlrun/frameworks/_common/plan.py +3 -3
  94. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  95. mlrun/frameworks/_ml_common/plan.py +1 -1
  96. mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
  97. mlrun/frameworks/lgbm/__init__.py +1 -1
  98. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  99. mlrun/frameworks/lgbm/model_handler.py +1 -1
  100. mlrun/frameworks/parallel_coordinates.py +4 -4
  101. mlrun/frameworks/pytorch/__init__.py +2 -2
  102. mlrun/frameworks/sklearn/__init__.py +1 -1
  103. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  104. mlrun/frameworks/tf_keras/__init__.py +5 -2
  105. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  106. mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
  107. mlrun/frameworks/xgboost/__init__.py +1 -1
  108. mlrun/k8s_utils.py +57 -12
  109. mlrun/launcher/__init__.py +1 -1
  110. mlrun/launcher/base.py +6 -5
  111. mlrun/launcher/client.py +13 -11
  112. mlrun/launcher/factory.py +1 -1
  113. mlrun/launcher/local.py +15 -5
  114. mlrun/launcher/remote.py +10 -3
  115. mlrun/lists.py +6 -2
  116. mlrun/model.py +297 -48
  117. mlrun/model_monitoring/__init__.py +1 -1
  118. mlrun/model_monitoring/api.py +152 -357
  119. mlrun/model_monitoring/applications/__init__.py +10 -0
  120. mlrun/model_monitoring/applications/_application_steps.py +190 -0
  121. mlrun/model_monitoring/applications/base.py +108 -0
  122. mlrun/model_monitoring/applications/context.py +341 -0
  123. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  124. mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
  125. mlrun/model_monitoring/applications/results.py +99 -0
  126. mlrun/model_monitoring/controller.py +130 -303
  127. mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
  128. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  129. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  130. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  131. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  132. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  133. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  134. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  135. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  136. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  137. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  138. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  139. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  140. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  141. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  142. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  143. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
  144. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  145. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
  146. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  147. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  148. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  149. mlrun/model_monitoring/features_drift_table.py +34 -22
  150. mlrun/model_monitoring/helpers.py +177 -39
  151. mlrun/model_monitoring/model_endpoint.py +3 -2
  152. mlrun/model_monitoring/stream_processing.py +165 -398
  153. mlrun/model_monitoring/tracking_policy.py +7 -1
  154. mlrun/model_monitoring/writer.py +161 -125
  155. mlrun/package/packagers/default_packager.py +2 -2
  156. mlrun/package/packagers_manager.py +1 -0
  157. mlrun/package/utils/_formatter.py +2 -2
  158. mlrun/platforms/__init__.py +11 -10
  159. mlrun/platforms/iguazio.py +67 -228
  160. mlrun/projects/__init__.py +6 -1
  161. mlrun/projects/operations.py +47 -20
  162. mlrun/projects/pipelines.py +396 -249
  163. mlrun/projects/project.py +1176 -406
  164. mlrun/render.py +28 -22
  165. mlrun/run.py +208 -181
  166. mlrun/runtimes/__init__.py +76 -11
  167. mlrun/runtimes/base.py +54 -24
  168. mlrun/runtimes/daskjob.py +9 -2
  169. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  170. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  171. mlrun/runtimes/funcdoc.py +1 -29
  172. mlrun/runtimes/kubejob.py +34 -128
  173. mlrun/runtimes/local.py +39 -10
  174. mlrun/runtimes/mpijob/__init__.py +0 -20
  175. mlrun/runtimes/mpijob/abstract.py +8 -8
  176. mlrun/runtimes/mpijob/v1.py +1 -1
  177. mlrun/runtimes/nuclio/__init__.py +1 -0
  178. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  179. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  180. mlrun/runtimes/nuclio/application/application.py +758 -0
  181. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  182. mlrun/runtimes/nuclio/function.py +188 -68
  183. mlrun/runtimes/nuclio/serving.py +57 -60
  184. mlrun/runtimes/pod.py +191 -58
  185. mlrun/runtimes/remotesparkjob.py +11 -8
  186. mlrun/runtimes/sparkjob/spark3job.py +17 -18
  187. mlrun/runtimes/utils.py +40 -73
  188. mlrun/secrets.py +6 -2
  189. mlrun/serving/__init__.py +8 -1
  190. mlrun/serving/remote.py +2 -3
  191. mlrun/serving/routers.py +89 -64
  192. mlrun/serving/server.py +54 -26
  193. mlrun/serving/states.py +187 -56
  194. mlrun/serving/utils.py +19 -11
  195. mlrun/serving/v2_serving.py +136 -63
  196. mlrun/track/tracker.py +2 -1
  197. mlrun/track/trackers/mlflow_tracker.py +5 -0
  198. mlrun/utils/async_http.py +26 -6
  199. mlrun/utils/db.py +18 -0
  200. mlrun/utils/helpers.py +375 -105
  201. mlrun/utils/http.py +2 -2
  202. mlrun/utils/logger.py +75 -9
  203. mlrun/utils/notifications/notification/__init__.py +14 -10
  204. mlrun/utils/notifications/notification/base.py +48 -0
  205. mlrun/utils/notifications/notification/console.py +2 -0
  206. mlrun/utils/notifications/notification/git.py +24 -1
  207. mlrun/utils/notifications/notification/ipython.py +2 -0
  208. mlrun/utils/notifications/notification/slack.py +96 -21
  209. mlrun/utils/notifications/notification/webhook.py +63 -2
  210. mlrun/utils/notifications/notification_pusher.py +146 -16
  211. mlrun/utils/regex.py +9 -0
  212. mlrun/utils/retryer.py +3 -2
  213. mlrun/utils/v3io_clients.py +2 -3
  214. mlrun/utils/version/version.json +2 -2
  215. mlrun-1.7.2.dist-info/METADATA +390 -0
  216. mlrun-1.7.2.dist-info/RECORD +351 -0
  217. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
  218. mlrun/feature_store/retrieval/conversion.py +0 -271
  219. mlrun/kfpops.py +0 -868
  220. mlrun/model_monitoring/application.py +0 -310
  221. mlrun/model_monitoring/batch.py +0 -974
  222. mlrun/model_monitoring/controller_handler.py +0 -37
  223. mlrun/model_monitoring/prometheus.py +0 -216
  224. mlrun/model_monitoring/stores/__init__.py +0 -111
  225. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
  226. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
  227. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  228. mlrun/model_monitoring/stores/models/base.py +0 -84
  229. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
  230. mlrun/platforms/other.py +0 -305
  231. mlrun-1.7.0rc4.dist-info/METADATA +0 -269
  232. mlrun-1.7.0rc4.dist-info/RECORD +0 -321
  233. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
  234. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
  235. {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0

mlrun/datastore/sources.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
@@ -28,7 +29,10 @@ from nuclio.config import split_path
 
 import mlrun
 from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
+from mlrun.utils import logger
 
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
@@ -82,7 +86,8 @@ class BaseSourceDriver(DataSource):
             )
 
         explicit_ack = (
-            is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
         return storey.SyncEmitSource(
             context=context,
@@ -101,8 +106,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -113,7 +122,11 @@ class BaseSourceDriver(DataSource):
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-            df = load_spark_dataframe_with_options(session, self.get_spark_options())
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
@@ -169,7 +182,7 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -204,11 +217,11 @@ class CSVSource(BaseSourceDriver):
         )
 
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
-                "path": url,
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
@@ -240,7 +253,11 @@ class CSVSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -276,6 +293,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -286,13 +309,19 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -320,6 +349,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
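Note (illustrative, not part of the diff): based on the docstring and constructor above, a ParquetSource using the new additional_filters argument might be declared as follows; the path and column names are hypothetical.

    from mlrun.datastore.sources import ParquetSource

    # Hypothetical path and columns, shown only to illustrate the filter format.
    source = ParquetSource(
        name="sales",
        path="v3io:///projects/demo/sales.parquet",
        additional_filters=[("Product", "=", "Computer"), ("Price", ">", 100)],
    )
    # The filters are stored in source.attributes["additional_filters"] and are
    # forwarded to the pandas/storey/Spark readers by to_dataframe(), to_step(), etc.
    df = source.to_dataframe()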
@@ -336,16 +369,17 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes or {}
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
         store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
         return storey.ParquetSource(
             paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
@@ -353,11 +387,22 @@ class ParquetSource(BaseSourceDriver):
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
         spark_options = store.get_spark_options()
         spark_options.update(
             {
@@ -375,8 +420,10 @@ class ParquetSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
        return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -384,9 +431,88 @@ class ParquetSource(BaseSourceDriver):
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
@@ -401,12 +527,17 @@ class BigQuerySource(BaseSourceDriver):
 
         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource("bq1", query=query_string,
-                                gcp_project="my_project",
-                                materialization_dataset="dataviews")
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )
 
         # read a table
-        source = BigQuerySource("bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project")
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )
 
 
     :parameter name: source name
@@ -509,10 +640,15 @@ class BigQuerySource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -552,7 +688,6 @@ class BigQuerySource(BaseSourceDriver):
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -614,7 +749,7 @@ class SnowflakeSource(BaseSourceDriver):
             url="...",
             user="...",
             database="...",
-            schema="...",
+            db_schema="...",
             warehouse="...",
         )
 
@@ -629,7 +764,8 @@ class SnowflakeSource(BaseSourceDriver):
     :parameter url: URL of the snowflake cluster
     :parameter user: snowflake user
     :parameter database: snowflake database
-    :parameter schema: snowflake schema
+    :parameter schema: snowflake schema - deprecated, use db_schema
+    :parameter db_schema: snowflake schema
     :parameter warehouse: snowflake warehouse
     """
 
@@ -641,6 +777,7 @@ class SnowflakeSource(BaseSourceDriver):
         self,
         name: str = "",
         key_field: str = None,
+        attributes: dict[str, object] = None,
         time_field: str = None,
         schedule: str = None,
         start_time=None,
@@ -650,21 +787,34 @@ class SnowflakeSource(BaseSourceDriver):
         user: str = None,
         database: str = None,
         schema: str = None,
+        db_schema: str = None,
         warehouse: str = None,
         **kwargs,
     ):
-        attrs = {
-            "query": query,
-            "url": url,
-            "user": user,
-            "database": database,
-            "schema": schema,
-            "warehouse": warehouse,
-        }
+        # TODO: Remove in 1.9.0
+        if schema:
+            warnings.warn(
+                "schema is deprecated in 1.7.0, and will be removed in 1.9.0, please use db_schema"
+            )
+        db_schema = db_schema or schema  # TODO: Remove in 1.9.0
+
+        attributes = attributes or {}
+        if url:
+            attributes["url"] = url
+        if user:
+            attributes["user"] = user
+        if database:
+            attributes["database"] = database
+        if db_schema:
+            attributes["db_schema"] = db_schema
+        if warehouse:
+            attributes["warehouse"] = warehouse
+        if query:
+            attributes["query"] = query
 
         super().__init__(
             name,
-            attributes=attrs,
+            attributes=attributes,
             key_field=key_field,
             time_field=time_field,
             schedule=schedule,
@@ -673,32 +823,24 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )
 
-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-        return {
-            "format": "net.snowflake.spark.snowflake",
-            "query": self.attributes.get("query"),
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
+
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        raise mlrun.errors.MLRunRuntimeError(
+            f"{type(self).__name__} supports only spark engine"
+        )
 
 
 class CustomSource(BaseSourceDriver):
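Note (illustrative, not part of the diff): with the schema to db_schema deprecation above, a SnowflakeSource would now be declared roughly as below; all connection values are placeholders.

    from mlrun.datastore.sources import SnowflakeSource

    source = SnowflakeSource(
        "snowflake_source",
        query="SELECT * FROM MY_TABLE LIMIT 100",
        url="<account>.snowflakecomputing.com",
        user="<user>",
        database="<database>",
        db_schema="<schema>",  # replaces the deprecated `schema` argument
        warehouse="<warehouse>",
    )
    # Per the change above, reading is Spark-only: to_dataframe() now raises
    # MLRunRuntimeError, and get_spark_options() builds the Spark connection options.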
@@ -752,7 +894,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(self, **kwargs):
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -792,9 +946,11 @@ class OnlineSource(BaseSourceDriver):
 
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
-            is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
+            is_explicit_ack_supported(context)
+            and mlrun.mlconf.is_explicit_ack_enabled()
         )
-        src_class = storey.AsyncEmitSource(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -853,12 +1009,11 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
         if store.kind != "v3io":
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Only profiles that reference the v3io datastore can be used with StreamSource"
             )
-        path = "v3io:/" + path
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key")
         endpoint, stream_path = parse_path(url)
@@ -877,12 +1032,13 @@ class StreamSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-        if mlrun.mlconf.is_explicit_ack() and engine == "async":
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             kwargs["explicit_ack_mode"] = "explicitOnly"
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-            path,
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -947,6 +1103,7 @@ class KafkaSource(OnlineSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -963,7 +1120,8 @@ class KafkaSource(OnlineSource):
         engine = "async"
         if hasattr(function.spec, "graph") and function.spec.graph.engine:
             engine = function.spec.graph.engine
-        if mlrun.mlconf.is_explicit_ack() and engine == "async":
+
+        if mlrun.mlconf.is_explicit_ack_enabled() and engine == "async":
             explicit_ack_mode = "explicitOnly"
             extra_attributes["workerAllocationMode"] = extra_attributes.get(
                 "worker_allocation_mode", "static"
@@ -1006,6 +1164,59 @@ class KafkaSource(OnlineSource):
             "to a Spark dataframe is not possible, as this operation is not supported by Spark"
         )
 
+    def create_topics(
+        self,
+        num_partitions: int = 4,
+        replication_factor: int = 1,
+        topics: list[str] = None,
+    ):
+        """
+        Create Kafka topics with the specified number of partitions and replication factor.
+
+        :param num_partitions: number of partitions for the topics
+        :param replication_factor: replication factor for the topics
+        :param topics: list of topic names to create, if None,
+            the topics will be taken from the source attributes
+        """
+        from kafka.admin import KafkaAdminClient, NewTopic
+
+        brokers = self.attributes.get("brokers")
+        if not brokers:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "brokers must be specified in the KafkaSource attributes"
+            )
+        topics = topics or self.attributes.get("topics")
+        if not topics:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "topics must be specified in the KafkaSource attributes"
+            )
+        new_topics = [
+            NewTopic(topic, num_partitions, replication_factor) for topic in topics
+        ]
+        kafka_admin = KafkaAdminClient(
+            bootstrap_servers=brokers,
+            sasl_mechanism=self.attributes.get("sasl", {}).get("sasl_mechanism"),
+            sasl_plain_username=self.attributes.get("sasl", {}).get("username"),
+            sasl_plain_password=self.attributes.get("sasl", {}).get("password"),
+            sasl_kerberos_service_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_service_name", "kafka"
+            ),
+            sasl_kerberos_domain_name=self.attributes.get("sasl", {}).get(
+                "sasl_kerberos_domain_name"
+            ),
+            sasl_oauth_token_provider=self.attributes.get("sasl", {}).get("mechanism"),
+        )
+        try:
+            kafka_admin.create_topics(new_topics)
+        finally:
+            kafka_admin.close()
+        logger.info(
+            "Kafka topics created successfully",
+            topics=topics,
+            num_partitions=num_partitions,
+            replication_factor=replication_factor,
+        )
+
 
 class SQLSource(BaseSourceDriver):
     kind = "sqldb"
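Note (illustrative, not part of the diff): the new create_topics helper above can be driven from the source attributes; the broker and topic names below are placeholders, and the method needs the kafka-python package (per the import inside it).

    from mlrun.datastore.sources import KafkaSource

    source = KafkaSource(
        brokers=["broker.example.com:9092"],
        topics=["model-monitoring-stream"],
    )
    # Creates the topics listed in the source attributes (or pass topics=[...]).
    source.create_topics(num_partitions=4, replication_factor=1)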
@@ -1087,9 +1298,13 @@ class SQLSource(BaseSourceDriver):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")

mlrun/datastore/spark_utils.py

@@ -13,7 +13,10 @@
 # limitations under the License.
 
 
+from typing import Union
+
 import mlrun
+from mlrun.features import Entity
 
 
 def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
@@ -35,3 +38,30 @@ def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
         else:
             non_hadoop_spark_options[key] = value
     return non_hadoop_spark_options
+
+
+def check_special_columns_exists(
+    spark_df, entities: list[Union[Entity, str]], timestamp_key: str, label_column: str
+):
+    columns = spark_df.columns
+    entities = entities or []
+    entities = [
+        entity.name if isinstance(entity, Entity) else entity for entity in entities
+    ]
+    missing_entities = [entity for entity in entities if entity not in columns]
+    cases_message = "Please check the letter cases (uppercase or lowercase)"
+    if missing_entities:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"There are missing entities from dataframe during ingestion. missing_entities: {missing_entities}."
+            f" {cases_message}"
+        )
+    if timestamp_key and timestamp_key not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"timestamp_key is missing from dataframe during ingestion. timestamp_key: {timestamp_key}."
+            f" {cases_message}"
+        )
+    if label_column and label_column not in columns:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"label_column is missing from dataframe during ingestion. label_column: {label_column}. "
+            f"{cases_message}"
+        )
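Note (illustrative, not part of the diff): the new check_special_columns_exists helper validates that entity, timestamp and label columns exist (with matching letter case) before Spark ingestion; a minimal sketch with made-up column names:

    from pyspark.sql import SparkSession

    from mlrun.datastore.spark_utils import check_special_columns_exists

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    spark_df = spark.createDataFrame(
        [("c1", "2024-01-01", 1)], ["customer_id", "timestamp", "label"]
    )

    # Passes silently when all columns are present; raises MLRunInvalidArgumentError
    # (hinting at letter-case issues) when one of them is missing.
    check_special_columns_exists(
        spark_df,
        entities=["customer_id"],
        timestamp_key="timestamp",
        label_column="label",
    )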

mlrun/datastore/store_resources.py

@@ -17,7 +17,7 @@
 import mlrun
 import mlrun.artifacts
 from mlrun.config import config
-from mlrun.utils.helpers import is_legacy_artifact, parse_artifact_uri
+from mlrun.utils.helpers import parse_artifact_uri
 
 from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
@@ -27,6 +27,8 @@ from .targets import get_online_target
 
 def is_store_uri(url):
     """detect if the uri starts with the store schema prefix"""
+    if not url:
+        return False
     return url.startswith(DB_SCHEMA + "://")
 
 
@@ -146,7 +148,11 @@
 
     db = db or mlrun.get_run_db(secrets=secrets)
     kind, uri = parse_store_uri(uri)
-    if kind == StorePrefix.FeatureSet:
+    if not kind:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Cannot get store resource from invalid URI: {uri}"
+        )
+    elif kind == StorePrefix.FeatureSet:
         project, name, tag, uid = parse_versioned_object_uri(
             uri, project or config.default_project
         )
@@ -167,11 +173,7 @@
         )
     if resource.get("kind", "") == "link":
         # todo: support other link types (not just iter, move this to the db/api layer
-        link_iteration = (
-            resource.get("link_iteration", 0)
-            if is_legacy_artifact(resource)
-            else resource["spec"].get("link_iteration", 0)
-        )
+        link_iteration = resource["spec"].get("link_iteration", 0)
 
         resource = db.read_artifact(
             key,