mlrun 1.6.4rc2__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (291)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +26 -112
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +46 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +47 -48
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +69 -0
  13. mlrun/common/db/sql_session.py +2 -3
  14. mlrun/common/formatters/__init__.py +19 -0
  15. mlrun/common/formatters/artifact.py +21 -0
  16. mlrun/common/formatters/base.py +78 -0
  17. mlrun/common/formatters/function.py +41 -0
  18. mlrun/common/formatters/pipeline.py +53 -0
  19. mlrun/common/formatters/project.py +51 -0
  20. mlrun/common/helpers.py +1 -2
  21. mlrun/common/model_monitoring/helpers.py +9 -5
  22. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  23. mlrun/common/schemas/__init__.py +24 -4
  24. mlrun/common/schemas/alert.py +203 -0
  25. mlrun/common/schemas/api_gateway.py +148 -0
  26. mlrun/common/schemas/artifact.py +18 -8
  27. mlrun/common/schemas/auth.py +11 -5
  28. mlrun/common/schemas/background_task.py +1 -1
  29. mlrun/common/schemas/client_spec.py +4 -1
  30. mlrun/common/schemas/feature_store.py +16 -16
  31. mlrun/common/schemas/frontend_spec.py +8 -7
  32. mlrun/common/schemas/function.py +5 -1
  33. mlrun/common/schemas/hub.py +11 -18
  34. mlrun/common/schemas/memory_reports.py +2 -2
  35. mlrun/common/schemas/model_monitoring/__init__.py +18 -3
  36. mlrun/common/schemas/model_monitoring/constants.py +83 -26
  37. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  38. mlrun/common/schemas/model_monitoring/model_endpoints.py +99 -16
  39. mlrun/common/schemas/notification.py +4 -4
  40. mlrun/common/schemas/object.py +2 -2
  41. mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
  42. mlrun/common/schemas/pipeline.py +1 -10
  43. mlrun/common/schemas/project.py +24 -23
  44. mlrun/common/schemas/runtime_resource.py +8 -12
  45. mlrun/common/schemas/schedule.py +3 -3
  46. mlrun/common/schemas/tag.py +1 -2
  47. mlrun/common/schemas/workflow.py +2 -2
  48. mlrun/common/types.py +7 -1
  49. mlrun/config.py +54 -17
  50. mlrun/data_types/to_pandas.py +10 -12
  51. mlrun/datastore/__init__.py +5 -8
  52. mlrun/datastore/alibaba_oss.py +130 -0
  53. mlrun/datastore/azure_blob.py +17 -5
  54. mlrun/datastore/base.py +62 -39
  55. mlrun/datastore/datastore.py +28 -9
  56. mlrun/datastore/datastore_profile.py +146 -20
  57. mlrun/datastore/filestore.py +0 -1
  58. mlrun/datastore/google_cloud_storage.py +6 -2
  59. mlrun/datastore/hdfs.py +56 -0
  60. mlrun/datastore/inmem.py +2 -2
  61. mlrun/datastore/redis.py +6 -2
  62. mlrun/datastore/s3.py +9 -0
  63. mlrun/datastore/snowflake_utils.py +43 -0
  64. mlrun/datastore/sources.py +201 -96
  65. mlrun/datastore/spark_utils.py +1 -2
  66. mlrun/datastore/store_resources.py +7 -7
  67. mlrun/datastore/targets.py +358 -104
  68. mlrun/datastore/utils.py +72 -58
  69. mlrun/datastore/v3io.py +5 -1
  70. mlrun/db/base.py +185 -35
  71. mlrun/db/factory.py +1 -1
  72. mlrun/db/httpdb.py +614 -179
  73. mlrun/db/nopdb.py +210 -26
  74. mlrun/errors.py +12 -1
  75. mlrun/execution.py +41 -24
  76. mlrun/feature_store/__init__.py +0 -2
  77. mlrun/feature_store/api.py +40 -72
  78. mlrun/feature_store/common.py +1 -1
  79. mlrun/feature_store/feature_set.py +76 -55
  80. mlrun/feature_store/feature_vector.py +28 -30
  81. mlrun/feature_store/ingestion.py +7 -6
  82. mlrun/feature_store/retrieval/base.py +16 -11
  83. mlrun/feature_store/retrieval/conversion.py +11 -13
  84. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  85. mlrun/feature_store/retrieval/job.py +9 -3
  86. mlrun/feature_store/retrieval/local_merger.py +2 -0
  87. mlrun/feature_store/retrieval/spark_merger.py +34 -24
  88. mlrun/feature_store/steps.py +37 -34
  89. mlrun/features.py +9 -20
  90. mlrun/frameworks/_common/artifacts_library.py +9 -9
  91. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  92. mlrun/frameworks/_common/model_handler.py +48 -48
  93. mlrun/frameworks/_common/plan.py +2 -3
  94. mlrun/frameworks/_common/producer.py +3 -4
  95. mlrun/frameworks/_common/utils.py +5 -5
  96. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  97. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  98. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  99. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  100. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  101. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  102. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  103. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  104. mlrun/frameworks/_ml_common/plan.py +1 -1
  105. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  106. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  107. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  108. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  109. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  110. mlrun/frameworks/_ml_common/utils.py +4 -4
  111. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  112. mlrun/frameworks/huggingface/model_server.py +4 -4
  113. mlrun/frameworks/lgbm/__init__.py +33 -33
  114. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  115. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  116. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  117. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  118. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  119. mlrun/frameworks/lgbm/model_handler.py +10 -10
  120. mlrun/frameworks/lgbm/model_server.py +6 -6
  121. mlrun/frameworks/lgbm/utils.py +5 -5
  122. mlrun/frameworks/onnx/dataset.py +8 -8
  123. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  124. mlrun/frameworks/onnx/model_handler.py +6 -6
  125. mlrun/frameworks/onnx/model_server.py +7 -7
  126. mlrun/frameworks/parallel_coordinates.py +4 -3
  127. mlrun/frameworks/pytorch/__init__.py +18 -18
  128. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  129. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  130. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  131. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  132. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  133. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  134. mlrun/frameworks/pytorch/model_handler.py +17 -17
  135. mlrun/frameworks/pytorch/model_server.py +7 -7
  136. mlrun/frameworks/sklearn/__init__.py +13 -13
  137. mlrun/frameworks/sklearn/estimator.py +4 -4
  138. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  139. mlrun/frameworks/sklearn/mlrun_interface.py +3 -6
  140. mlrun/frameworks/sklearn/model_handler.py +2 -2
  141. mlrun/frameworks/tf_keras/__init__.py +10 -7
  142. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  143. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  144. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  145. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  146. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  147. mlrun/frameworks/tf_keras/model_server.py +6 -6
  148. mlrun/frameworks/xgboost/__init__.py +13 -13
  149. mlrun/frameworks/xgboost/model_handler.py +6 -6
  150. mlrun/k8s_utils.py +14 -16
  151. mlrun/launcher/__init__.py +1 -1
  152. mlrun/launcher/base.py +16 -15
  153. mlrun/launcher/client.py +8 -6
  154. mlrun/launcher/factory.py +1 -1
  155. mlrun/launcher/local.py +17 -11
  156. mlrun/launcher/remote.py +16 -10
  157. mlrun/lists.py +7 -6
  158. mlrun/model.py +238 -73
  159. mlrun/model_monitoring/__init__.py +1 -1
  160. mlrun/model_monitoring/api.py +138 -315
  161. mlrun/model_monitoring/application.py +5 -296
  162. mlrun/model_monitoring/applications/__init__.py +24 -0
  163. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  164. mlrun/model_monitoring/applications/base.py +282 -0
  165. mlrun/model_monitoring/applications/context.py +214 -0
  166. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  167. mlrun/model_monitoring/applications/histogram_data_drift.py +349 -0
  168. mlrun/model_monitoring/applications/results.py +99 -0
  169. mlrun/model_monitoring/controller.py +104 -84
  170. mlrun/model_monitoring/controller_handler.py +13 -5
  171. mlrun/model_monitoring/db/__init__.py +18 -0
  172. mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
  173. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  174. mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +64 -40
  175. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  176. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  177. mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
  178. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
  179. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  180. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
  181. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  182. mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +310 -165
  183. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  184. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  185. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  187. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  188. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  189. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  190. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  191. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  192. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
  193. mlrun/model_monitoring/evidently_application.py +6 -118
  194. mlrun/model_monitoring/features_drift_table.py +134 -106
  195. mlrun/model_monitoring/helpers.py +127 -28
  196. mlrun/model_monitoring/metrics/__init__.py +13 -0
  197. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  198. mlrun/model_monitoring/model_endpoint.py +3 -2
  199. mlrun/model_monitoring/prometheus.py +1 -4
  200. mlrun/model_monitoring/stream_processing.py +62 -231
  201. mlrun/model_monitoring/tracking_policy.py +9 -2
  202. mlrun/model_monitoring/writer.py +152 -124
  203. mlrun/package/__init__.py +6 -6
  204. mlrun/package/context_handler.py +5 -5
  205. mlrun/package/packager.py +7 -7
  206. mlrun/package/packagers/default_packager.py +6 -6
  207. mlrun/package/packagers/numpy_packagers.py +15 -15
  208. mlrun/package/packagers/pandas_packagers.py +5 -5
  209. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  210. mlrun/package/packagers_manager.py +19 -23
  211. mlrun/package/utils/_formatter.py +6 -6
  212. mlrun/package/utils/_pickler.py +2 -2
  213. mlrun/package/utils/_supported_format.py +4 -4
  214. mlrun/package/utils/log_hint_utils.py +2 -2
  215. mlrun/package/utils/type_hint_utils.py +4 -9
  216. mlrun/platforms/__init__.py +11 -10
  217. mlrun/platforms/iguazio.py +24 -203
  218. mlrun/projects/operations.py +35 -21
  219. mlrun/projects/pipelines.py +68 -99
  220. mlrun/projects/project.py +830 -266
  221. mlrun/render.py +3 -11
  222. mlrun/run.py +162 -166
  223. mlrun/runtimes/__init__.py +62 -7
  224. mlrun/runtimes/base.py +39 -32
  225. mlrun/runtimes/daskjob.py +8 -8
  226. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  227. mlrun/runtimes/databricks_job/databricks_runtime.py +7 -7
  228. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  229. mlrun/runtimes/funcdoc.py +0 -28
  230. mlrun/runtimes/function_reference.py +1 -1
  231. mlrun/runtimes/kubejob.py +28 -122
  232. mlrun/runtimes/local.py +6 -3
  233. mlrun/runtimes/mpijob/__init__.py +0 -20
  234. mlrun/runtimes/mpijob/abstract.py +9 -10
  235. mlrun/runtimes/mpijob/v1.py +1 -1
  236. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  237. mlrun/runtimes/nuclio/api_gateway.py +709 -0
  238. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  239. mlrun/runtimes/nuclio/application/application.py +523 -0
  240. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  241. mlrun/runtimes/{function.py → nuclio/function.py} +112 -73
  242. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  243. mlrun/runtimes/{serving.py → nuclio/serving.py} +45 -51
  244. mlrun/runtimes/pod.py +286 -88
  245. mlrun/runtimes/remotesparkjob.py +2 -2
  246. mlrun/runtimes/sparkjob/spark3job.py +51 -34
  247. mlrun/runtimes/utils.py +7 -75
  248. mlrun/secrets.py +9 -5
  249. mlrun/serving/remote.py +2 -7
  250. mlrun/serving/routers.py +13 -10
  251. mlrun/serving/server.py +22 -26
  252. mlrun/serving/states.py +99 -25
  253. mlrun/serving/utils.py +3 -3
  254. mlrun/serving/v1_serving.py +6 -7
  255. mlrun/serving/v2_serving.py +59 -20
  256. mlrun/track/tracker.py +2 -1
  257. mlrun/track/tracker_manager.py +3 -3
  258. mlrun/track/trackers/mlflow_tracker.py +1 -2
  259. mlrun/utils/async_http.py +5 -7
  260. mlrun/utils/azure_vault.py +1 -1
  261. mlrun/utils/clones.py +1 -2
  262. mlrun/utils/condition_evaluator.py +3 -3
  263. mlrun/utils/db.py +3 -3
  264. mlrun/utils/helpers.py +183 -197
  265. mlrun/utils/http.py +2 -5
  266. mlrun/utils/logger.py +76 -14
  267. mlrun/utils/notifications/notification/__init__.py +17 -12
  268. mlrun/utils/notifications/notification/base.py +14 -2
  269. mlrun/utils/notifications/notification/console.py +2 -0
  270. mlrun/utils/notifications/notification/git.py +3 -1
  271. mlrun/utils/notifications/notification/ipython.py +3 -1
  272. mlrun/utils/notifications/notification/slack.py +101 -21
  273. mlrun/utils/notifications/notification/webhook.py +11 -1
  274. mlrun/utils/notifications/notification_pusher.py +155 -30
  275. mlrun/utils/retryer.py +208 -0
  276. mlrun/utils/singleton.py +1 -1
  277. mlrun/utils/v3io_clients.py +2 -4
  278. mlrun/utils/version/version.json +2 -2
  279. mlrun/utils/version/version.py +2 -6
  280. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +31 -19
  281. mlrun-1.7.0rc20.dist-info/RECORD +353 -0
  282. mlrun/kfpops.py +0 -868
  283. mlrun/model_monitoring/batch.py +0 -1095
  284. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  285. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  286. mlrun/platforms/other.py +0 -306
  287. mlrun-1.6.4rc2.dist-info/RECORD +0 -314
  288. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
  289. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +0 -0
  290. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
  291. {mlrun-1.6.4rc2.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/datastore/redis.py CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-        REDIS_DEFAULT_PORT = "6379"
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else REDIS_DEFAULT_PORT
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
@@ -163,3 +163,7 @@ class RedisStore(DataStore):
                 self.redis.delete(k)
         else:
             self.redis.delete(key)
+
+    @property
+    def spark_url(self):
+        return ""
mlrun/datastore/s3.py CHANGED
@@ -156,6 +156,10 @@ class S3Store(DataStore):
 
         return self._sanitize_storage_options(storage_options)
 
+    @property
+    def spark_url(self):
+        return f"s3a://{self.endpoint}"
+
     def get_bucket_and_key(self, key):
         path = self._join(key)[1:]
         return self.endpoint, path
@@ -194,6 +198,11 @@
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
 
+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+
 
 def parse_s3_bucket_and_key(s3_path):
     try:
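The new spark_url property is consumed by the reworked get_spark_options() implementations in mlrun/datastore/sources.py (further down in this diff), which now build the Spark read path as store.spark_url + path instead of going through store_path_to_spark(). A minimal sketch of that composition, using a hypothetical bucket and key:

    # illustrative only; the bucket and key are placeholders
    store, path, _ = mlrun.store_manager.get_or_create_store("s3://my-bucket/data/sales.parquet")
    spark_path = store.spark_url + path  # roughly "s3a://my-bucket" + "/data/sales.parquet"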
mlrun/datastore/snowflake_utils.py ADDED
@@ -0,0 +1,43 @@
+# Copyright 2024 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import mlrun
+
+
+def get_snowflake_password():
+    key = "SNOWFLAKE_PASSWORD"
+    snowflake_password = mlrun.get_secret_or_env(key)
+
+    if not snowflake_password:
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"No password provided. Set password using the {key} "
+            "project secret or environment variable."
+        )
+
+    return snowflake_password
+
+
+def get_snowflake_spark_options(attributes):
+    return {
+        "format": "net.snowflake.spark.snowflake",
+        "sfURL": attributes.get("url"),
+        "sfUser": attributes.get("user"),
+        "sfPassword": get_snowflake_password(),
+        "sfDatabase": attributes.get("database"),
+        "sfSchema": attributes.get("schema"),
+        "sfWarehouse": attributes.get("warehouse"),
+        "application": "iguazio_platform",
+        "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
+    }
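For orientation, a minimal sketch of how these new helpers are meant to be consumed; SnowflakeSource.get_spark_options() in mlrun/datastore/sources.py (below) does essentially this. The attribute values here are placeholders, and SNOWFLAKE_PASSWORD must be available as a project secret or environment variable:

    from mlrun.datastore.snowflake_utils import get_snowflake_spark_options

    attributes = {
        "url": "<account>.snowflakecomputing.com",  # placeholder values
        "user": "analyst",
        "database": "mydb",
        "schema": "public",
        "warehouse": "compute_wh",
    }
    spark_options = get_snowflake_spark_options(attributes)
    spark_options["query"] = "select * from my_table"  # SnowflakeSource adds the query itself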
mlrun/datastore/sources.py CHANGED
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import operator
 import os
 import warnings
 from base64 import b64encode
 from copy import copy
 from datetime import datetime
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 import pandas as pd
 import semver
@@ -27,9 +28,11 @@ from nuclio import KafkaTrigger
 from nuclio.config import split_path
 
 import mlrun
+from mlrun.config import config
+from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+from mlrun.datastore.utils import transform_list_filters_to_tuple
 from mlrun.secrets import SecretsStore
 
-from ..config import config
 from ..model import DataSource
 from ..platforms.iguazio import parse_path
 from ..utils import get_class, is_explicit_ack_supported
@@ -39,7 +42,6 @@ from .utils import (
     _generate_sql_query_with_time_filter,
     filter_df_start_end_time,
     select_columns_from_df,
-    store_path_to_spark,
 )
 
 
@@ -102,8 +104,12 @@ class BaseSourceDriver(DataSource):
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         """return the source data as dataframe"""
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -114,7 +120,11 @@
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         if self.support_spark:
-            df = load_spark_dataframe_with_options(session, self.get_spark_options())
+            spark_options = self.get_spark_options()
+            spark_format = spark_options.pop("format", None)
+            df = load_spark_dataframe_with_options(
+                session, spark_options, format=spark_format
+            )
             if named_view:
                 df.createOrReplaceTempView(self.name)
             return self._filter_spark_df(df, time_field, columns)
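As a rough mental model only (not mlrun's actual helper), the reworked to_spark_df() above now pops "format" out of get_spark_options() and passes the remaining entries as reader options; assuming load_spark_dataframe_with_options() wraps session.read, the call amounts to something like:

    # assumption: an approximation of the helper's behaviour, not its real implementation
    def load_spark_dataframe_with_options(session, spark_options, format=None):
        reader = session.read
        if format:
            reader = reader.format(format)
        # the "path" entry is picked up from the remaining options
        return reader.options(**spark_options).load()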
@@ -170,10 +180,10 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
-        parse_dates: Union[None, int, str, List[int], List[str]] = None,
+        parse_dates: Union[None, int, str, list[int], list[str]] = None,
         **kwargs,
     ):
         super().__init__(name, path, attributes, key_field, schedule=schedule, **kwargs)
@@ -193,14 +203,10 @@
             parse_dates.append(time_field)
 
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
 
         return storey.CSVSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             build_dict=True,
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
@@ -209,25 +215,17 @@
         )
 
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "csv",
-                "header": "true",
-                "inferSchema": "true",
-            }
-
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "csv",
                 "header": "true",
                 "inferSchema": "true",
             }
+        )
+        return spark_options
 
     def to_spark_df(self, session, named_view=False, time_field=None, columns=None):
         import pyspark.sql.functions as funcs
@@ -253,7 +251,11 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         reader_args = self.attributes.get("reader_args", {})
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
@@ -289,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
     :parameter start_time: filters out data before this time
     :parameter end_time: filters out data after this time
     :parameter attributes: additional parameters to pass to storey.
+    :param additional_filters: List of additional_filter conditions as tuples.
+        Each tuple should be in the format (column_name, operator, value).
+        Supported operators: "=", ">=", "<=", ">", "<".
+        Example: [("Product", "=", "Computer")]
+        For all supported filters, please see:
+        https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
     """
 
     kind = "parquet"
@@ -299,13 +307,19 @@
         self,
         name: str = "",
         path: str = None,
-        attributes: Dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
         start_time: Optional[Union[datetime, str]] = None,
         end_time: Optional[Union[datetime, str]] = None,
+        additional_filters: Optional[list[Union[tuple, list]]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            additional_filters = transform_list_filters_to_tuple(additional_filters)
+            attributes["additional_filters"] = additional_filters
+
         super().__init__(
             name,
             path,
@@ -333,6 +347,10 @@
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -349,45 +367,48 @@
         start_time=None,
         end_time=None,
         context=None,
+        additional_filters=None,
     ):
         import storey
 
-        attributes = self.attributes or {}
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         data_item = mlrun.store_manager.object(self.path)
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            path = store.url + path
-        else:
-            path = data_item.url
-
+        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
         return storey.ParquetSource(
-            paths=path,  # unlike self.path, it already has store:// replaced
+            paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
             storage_options=data_item.store.get_storage_options(),
             end_filter=self.end_time,
             start_filter=self.start_time,
             filter_column=self.time_field or time_field,
+            additional_filters=self.additional_filters or additional_filters,
             **attributes,
         )
 
+    @classmethod
+    def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+        new_obj = super().from_dict(
+            struct=struct, fields=fields, deprecated_fields=deprecated_fields
+        )
+        new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+            new_obj.additional_filters
+        )
+        return new_obj
+
     def get_spark_options(self):
-        if self.path and self.path.startswith("ds://"):
-            store, path = mlrun.store_manager.get_or_create_store(self.path)
-            storage_spark_options = store.get_spark_options()
-            path = store.url + path
-            result = {
-                "path": store_path_to_spark(path, storage_spark_options),
-                "format": "parquet",
-            }
-            return {**result, **storage_spark_options}
-        else:
-            return {
-                "path": store_path_to_spark(self.path),
+        store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
+        spark_options = store.get_spark_options()
+        spark_options.update(
+            {
+                "path": store.spark_url + path,
                 "format": "parquet",
             }
+        )
+        return spark_options
 
     def to_dataframe(
         self,
@@ -397,8 +418,10 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         reader_args = self.attributes.get("reader_args", {})
+        additional_filters = transform_list_filters_to_tuple(additional_filters)
         return mlrun.store_manager.object(url=self.path).as_df(
             columns=columns,
             df_module=df_module,
@@ -406,9 +429,88 @@
             end_time=end_time or self.end_time,
             time_column=time_field or self.time_field,
             format="parquet",
+            additional_filters=additional_filters or self.additional_filters,
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+            else:
+                if op.lower() == "in":
+                    new_filter = col(col_name).isin(value)
+                elif op.lower() == "not in":
+                    new_filter = ~col(col_name).isin(value)
+                elif op in operators:
+                    new_filter = operators[op](col(col_name), value)
+                else:
+                    raise mlrun.errors.MLRunInvalidArgumentError(
+                        f"unsupported filter operator: {op}"
+                    )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
@@ -423,12 +525,17 @@ class BigQuerySource(BaseSourceDriver):
 
         # use sql query
         query_string = "SELECT * FROM `the-psf.pypi.downloads20210328` LIMIT 5000"
-        source = BigQuerySource("bq1", query=query_string,
-                                gcp_project="my_project",
-                                materialization_dataset="dataviews")
+        source = BigQuerySource(
+            "bq1",
+            query=query_string,
+            gcp_project="my_project",
+            materialization_dataset="dataviews",
+        )
 
         # read a table
-        source = BigQuerySource("bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project")
+        source = BigQuerySource(
+            "bq2", table="the-psf.pypi.downloads20210328", gcp_project="my_project"
+        )
 
 
     :parameter name: source name
@@ -531,10 +638,15 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         from google.cloud import bigquery
         from google.cloud.bigquery_storage_v1 import BigQueryReadClient
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
+
         def schema_to_dtypes(schema):
             from mlrun.data_types.data_types import gbq_to_pandas_dtype
 
@@ -574,7 +686,6 @@
         else:
             df = rows_iterator.to_dataframe(dtypes=dtypes)
 
-        # TODO : filter as part of the query
         return select_columns_from_df(
             filter_df_start_end_time(
                 df,
@@ -695,32 +806,10 @@ class SnowflakeSource(BaseSourceDriver):
             **kwargs,
         )
 
-    def _get_password(self):
-        key = "SNOWFLAKE_PASSWORD"
-        snowflake_password = os.getenv(key) or os.getenv(
-            SecretsStore.k8s_env_variable_name_for_secret(key)
-        )
-
-        if not snowflake_password:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "No password provided. Set password using the SNOWFLAKE_PASSWORD "
-                "project secret or environment variable."
-            )
-
-        return snowflake_password
-
     def get_spark_options(self):
-        return {
-            "format": "net.snowflake.spark.snowflake",
-            "query": self.attributes.get("query"),
-            "sfURL": self.attributes.get("url"),
-            "sfUser": self.attributes.get("user"),
-            "sfPassword": self._get_password(),
-            "sfDatabase": self.attributes.get("database"),
-            "sfSchema": self.attributes.get("schema"),
-            "sfWarehouse": self.attributes.get("warehouse"),
-            "application": "iguazio_platform",
-        }
+        spark_options = get_snowflake_spark_options(self.attributes)
+        spark_options["query"] = self.attributes.get("query")
+        return spark_options
 
 
 class CustomSource(BaseSourceDriver):
@@ -774,7 +863,19 @@ class DataFrameSource:
             context=self.context or context,
         )
 
-    def to_dataframe(self, **kwargs):
+    def to_dataframe(
+        self,
+        columns=None,
+        df_module=None,
+        entities=None,
+        start_time=None,
+        end_time=None,
+        time_field=None,
+        additional_filters=None,
+    ):
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         return self._df
 
     def is_iterator(self):
@@ -800,7 +901,7 @@ class OnlineSource(BaseSourceDriver):
         self,
         name: str = None,
         path: str = None,
-        attributes: Dict[str, object] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         workers: int = None,
@@ -812,16 +913,12 @@
     def to_step(self, key_field=None, time_field=None, context=None):
         import storey
 
-        source_class = (
-            storey.AsyncEmitSource
-            if config.datastore.async_source_mode == "enabled"
-            else storey.SyncEmitSource
-        )
         source_args = self.attributes.get("source_args", {})
         explicit_ack = (
             is_explicit_ack_supported(context) and mlrun.mlconf.is_explicit_ack()
         )
-        src_class = source_class(
+        # TODO: Change to AsyncEmitSource once we can drop support for nuclio<1.12.10
+        src_class = storey.SyncEmitSource(
             context=context,
             key_field=self.key_field or key_field,
             full_event=True,
@@ -848,8 +945,6 @@
 
 
 class StreamSource(OnlineSource):
-    """Sets stream source for the flow. If stream doesn't exist it will create it"""
-
     kind = "v3ioStream"
 
     def __init__(
@@ -863,7 +958,7 @@
         **kwargs,
     ):
         """
-        Sets stream source for the flow. If stream doesn't exist it will create it
+        Sets the stream source for the flow. If the stream doesn't exist it will create it.
 
         :param name: stream name. Default "stream"
         :param group: consumer group. Default "serving"
@@ -882,8 +977,15 @@
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        endpoint, stream_path = parse_path(self.path)
-        v3io_client = v3io.dataplane.Client(endpoint=endpoint)
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
+        if store.kind != "v3io":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Only profiles that reference the v3io datastore can be used with StreamSource"
+            )
+        storage_options = store.get_storage_options()
+        access_key = storage_options.get("v3io_access_key")
+        endpoint, stream_path = parse_path(url)
+        v3io_client = v3io.dataplane.Client(endpoint=endpoint, access_key=access_key)
         container, stream_path = split_path(stream_path)
         res = v3io_client.stream.create(
             container=container,
@@ -903,7 +1005,7 @@
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-            self.path,
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
@@ -915,8 +1017,6 @@
 
 
 class KafkaSource(OnlineSource):
-    """Sets kafka source for the flow"""
-
     kind = "kafka"
 
     def __init__(
@@ -970,6 +1070,7 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         raise mlrun.MLRunInvalidArgumentError(
             "KafkaSource does not support batch processing"
@@ -1047,7 +1148,7 @@ class SQLSource(BaseSourceDriver):
         db_url: str = None,
         table_name: str = None,
         spark_options: dict = None,
-        parse_dates: List[str] = None,
+        parse_dates: list[str] = None,
         **kwargs,
     ):
         """
@@ -1110,9 +1211,13 @@
         start_time=None,
         end_time=None,
         time_field=None,
+        additional_filters=None,
     ):
         import sqlalchemy as sqlalchemy
 
+        mlrun.utils.helpers.additional_filters_warning(
+            additional_filters, self.__class__
+        )
         db_path = self.attributes.get("db_path")
         table_name = self.attributes.get("table_name")
         parse_dates = self.attributes.get("parse_dates")
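A short usage sketch of the new additional_filters support added to ParquetSource above, following the tuple format documented in its docstring; the path and column names are placeholders:

    from mlrun.datastore.sources import ParquetSource

    source = ParquetSource(
        "sales",
        path="s3://my-bucket/sales.parquet",  # placeholder path
        additional_filters=[("Product", "=", "Computer"), ("Price", ">=", 100)],
    )
    df = source.to_dataframe()  # filters are passed down to the parquet reader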
mlrun/datastore/spark_utils.py CHANGED
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict
 
 import mlrun
 
 
-def spark_session_update_hadoop_options(session, spark_options) -> Dict[str, str]:
+def spark_session_update_hadoop_options(session, spark_options) -> dict[str, str]:
     hadoop_conf = session.sparkContext._jsc.hadoopConfiguration()
     non_hadoop_spark_options = {}
22