mlrun 1.4.0rc25__py3-none-any.whl → 1.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (184)
  1. mlrun/__init__.py +2 -35
  2. mlrun/__main__.py +3 -41
  3. mlrun/api/api/api.py +6 -0
  4. mlrun/api/api/endpoints/feature_store.py +0 -4
  5. mlrun/api/api/endpoints/files.py +14 -2
  6. mlrun/api/api/endpoints/frontend_spec.py +2 -1
  7. mlrun/api/api/endpoints/functions.py +95 -59
  8. mlrun/api/api/endpoints/grafana_proxy.py +9 -9
  9. mlrun/api/api/endpoints/logs.py +17 -3
  10. mlrun/api/api/endpoints/model_endpoints.py +3 -2
  11. mlrun/api/api/endpoints/pipelines.py +1 -5
  12. mlrun/api/api/endpoints/projects.py +88 -0
  13. mlrun/api/api/endpoints/runs.py +48 -6
  14. mlrun/api/api/endpoints/submit.py +2 -1
  15. mlrun/api/api/endpoints/workflows.py +355 -0
  16. mlrun/api/api/utils.py +3 -4
  17. mlrun/api/crud/__init__.py +1 -0
  18. mlrun/api/crud/client_spec.py +6 -2
  19. mlrun/api/crud/feature_store.py +5 -0
  20. mlrun/api/crud/model_monitoring/__init__.py +1 -0
  21. mlrun/api/crud/model_monitoring/deployment.py +497 -0
  22. mlrun/api/crud/model_monitoring/grafana.py +96 -42
  23. mlrun/api/crud/model_monitoring/helpers.py +159 -0
  24. mlrun/api/crud/model_monitoring/model_endpoints.py +202 -476
  25. mlrun/api/crud/notifications.py +9 -4
  26. mlrun/api/crud/pipelines.py +6 -11
  27. mlrun/api/crud/projects.py +2 -2
  28. mlrun/api/crud/runtime_resources.py +4 -3
  29. mlrun/api/crud/runtimes/nuclio/helpers.py +5 -1
  30. mlrun/api/crud/secrets.py +21 -0
  31. mlrun/api/crud/workflows.py +352 -0
  32. mlrun/api/db/base.py +16 -1
  33. mlrun/api/db/init_db.py +2 -4
  34. mlrun/api/db/session.py +1 -1
  35. mlrun/api/db/sqldb/db.py +129 -31
  36. mlrun/api/db/sqldb/models/models_mysql.py +15 -1
  37. mlrun/api/db/sqldb/models/models_sqlite.py +16 -2
  38. mlrun/api/launcher.py +38 -6
  39. mlrun/api/main.py +3 -2
  40. mlrun/api/rundb/__init__.py +13 -0
  41. mlrun/{db → api/rundb}/sqldb.py +36 -84
  42. mlrun/api/runtime_handlers/__init__.py +56 -0
  43. mlrun/api/runtime_handlers/base.py +1247 -0
  44. mlrun/api/runtime_handlers/daskjob.py +209 -0
  45. mlrun/api/runtime_handlers/kubejob.py +37 -0
  46. mlrun/api/runtime_handlers/mpijob.py +147 -0
  47. mlrun/api/runtime_handlers/remotesparkjob.py +29 -0
  48. mlrun/api/runtime_handlers/sparkjob.py +148 -0
  49. mlrun/api/schemas/__init__.py +17 -6
  50. mlrun/api/utils/builder.py +1 -4
  51. mlrun/api/utils/clients/chief.py +14 -0
  52. mlrun/api/utils/clients/iguazio.py +33 -33
  53. mlrun/api/utils/clients/nuclio.py +2 -2
  54. mlrun/api/utils/periodic.py +9 -2
  55. mlrun/api/utils/projects/follower.py +14 -7
  56. mlrun/api/utils/projects/leader.py +2 -1
  57. mlrun/api/utils/projects/remotes/nop_follower.py +2 -2
  58. mlrun/api/utils/projects/remotes/nop_leader.py +2 -2
  59. mlrun/api/utils/runtimes/__init__.py +14 -0
  60. mlrun/api/utils/runtimes/nuclio.py +43 -0
  61. mlrun/api/utils/scheduler.py +98 -15
  62. mlrun/api/utils/singletons/db.py +5 -1
  63. mlrun/api/utils/singletons/project_member.py +4 -1
  64. mlrun/api/utils/singletons/scheduler.py +1 -1
  65. mlrun/artifacts/base.py +6 -6
  66. mlrun/artifacts/dataset.py +4 -4
  67. mlrun/artifacts/manager.py +2 -3
  68. mlrun/artifacts/model.py +2 -2
  69. mlrun/artifacts/plots.py +8 -8
  70. mlrun/common/db/__init__.py +14 -0
  71. mlrun/common/helpers.py +37 -0
  72. mlrun/{mlutils → common/model_monitoring}/__init__.py +3 -2
  73. mlrun/common/model_monitoring/helpers.py +69 -0
  74. mlrun/common/schemas/__init__.py +13 -1
  75. mlrun/common/schemas/auth.py +4 -1
  76. mlrun/common/schemas/client_spec.py +1 -1
  77. mlrun/common/schemas/function.py +17 -0
  78. mlrun/common/schemas/model_monitoring/__init__.py +48 -0
  79. mlrun/common/{model_monitoring.py → schemas/model_monitoring/constants.py} +11 -23
  80. mlrun/common/schemas/model_monitoring/grafana.py +55 -0
  81. mlrun/common/schemas/{model_endpoints.py → model_monitoring/model_endpoints.py} +32 -65
  82. mlrun/common/schemas/notification.py +1 -0
  83. mlrun/common/schemas/object.py +4 -0
  84. mlrun/common/schemas/project.py +1 -0
  85. mlrun/common/schemas/regex.py +1 -1
  86. mlrun/common/schemas/runs.py +1 -8
  87. mlrun/common/schemas/schedule.py +1 -8
  88. mlrun/common/schemas/workflow.py +54 -0
  89. mlrun/config.py +45 -42
  90. mlrun/datastore/__init__.py +21 -0
  91. mlrun/datastore/base.py +1 -1
  92. mlrun/datastore/datastore.py +9 -0
  93. mlrun/datastore/dbfs_store.py +168 -0
  94. mlrun/datastore/helpers.py +18 -0
  95. mlrun/datastore/sources.py +1 -0
  96. mlrun/datastore/store_resources.py +2 -5
  97. mlrun/datastore/v3io.py +1 -2
  98. mlrun/db/__init__.py +4 -68
  99. mlrun/db/base.py +12 -0
  100. mlrun/db/factory.py +65 -0
  101. mlrun/db/httpdb.py +175 -20
  102. mlrun/db/nopdb.py +4 -2
  103. mlrun/execution.py +4 -2
  104. mlrun/feature_store/__init__.py +1 -0
  105. mlrun/feature_store/api.py +1 -2
  106. mlrun/feature_store/common.py +2 -1
  107. mlrun/feature_store/feature_set.py +1 -11
  108. mlrun/feature_store/feature_vector.py +340 -2
  109. mlrun/feature_store/ingestion.py +5 -10
  110. mlrun/feature_store/retrieval/base.py +118 -104
  111. mlrun/feature_store/retrieval/dask_merger.py +17 -10
  112. mlrun/feature_store/retrieval/job.py +4 -1
  113. mlrun/feature_store/retrieval/local_merger.py +18 -18
  114. mlrun/feature_store/retrieval/spark_merger.py +21 -14
  115. mlrun/feature_store/retrieval/storey_merger.py +22 -16
  116. mlrun/kfpops.py +3 -9
  117. mlrun/launcher/base.py +57 -53
  118. mlrun/launcher/client.py +5 -4
  119. mlrun/launcher/factory.py +24 -13
  120. mlrun/launcher/local.py +6 -6
  121. mlrun/launcher/remote.py +4 -4
  122. mlrun/lists.py +0 -11
  123. mlrun/model.py +11 -17
  124. mlrun/model_monitoring/__init__.py +2 -22
  125. mlrun/model_monitoring/features_drift_table.py +1 -1
  126. mlrun/model_monitoring/helpers.py +22 -210
  127. mlrun/model_monitoring/model_endpoint.py +1 -1
  128. mlrun/model_monitoring/model_monitoring_batch.py +127 -50
  129. mlrun/model_monitoring/prometheus.py +219 -0
  130. mlrun/model_monitoring/stores/__init__.py +16 -11
  131. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +95 -23
  132. mlrun/model_monitoring/stores/models/mysql.py +47 -29
  133. mlrun/model_monitoring/stores/models/sqlite.py +47 -29
  134. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +31 -19
  135. mlrun/model_monitoring/{stream_processing_fs.py → stream_processing.py} +206 -64
  136. mlrun/model_monitoring/tracking_policy.py +104 -0
  137. mlrun/package/packager.py +6 -8
  138. mlrun/package/packagers/default_packager.py +121 -10
  139. mlrun/package/packagers/numpy_packagers.py +1 -1
  140. mlrun/platforms/__init__.py +0 -2
  141. mlrun/platforms/iguazio.py +0 -56
  142. mlrun/projects/pipelines.py +53 -159
  143. mlrun/projects/project.py +10 -37
  144. mlrun/render.py +1 -1
  145. mlrun/run.py +8 -124
  146. mlrun/runtimes/__init__.py +6 -42
  147. mlrun/runtimes/base.py +29 -1249
  148. mlrun/runtimes/daskjob.py +2 -198
  149. mlrun/runtimes/funcdoc.py +0 -9
  150. mlrun/runtimes/function.py +25 -29
  151. mlrun/runtimes/kubejob.py +5 -29
  152. mlrun/runtimes/local.py +1 -1
  153. mlrun/runtimes/mpijob/__init__.py +2 -2
  154. mlrun/runtimes/mpijob/abstract.py +10 -1
  155. mlrun/runtimes/mpijob/v1.py +0 -76
  156. mlrun/runtimes/mpijob/v1alpha1.py +1 -74
  157. mlrun/runtimes/nuclio.py +3 -2
  158. mlrun/runtimes/pod.py +28 -18
  159. mlrun/runtimes/remotesparkjob.py +1 -15
  160. mlrun/runtimes/serving.py +14 -6
  161. mlrun/runtimes/sparkjob/__init__.py +0 -1
  162. mlrun/runtimes/sparkjob/abstract.py +4 -131
  163. mlrun/runtimes/utils.py +0 -26
  164. mlrun/serving/routers.py +7 -7
  165. mlrun/serving/server.py +11 -8
  166. mlrun/serving/states.py +7 -1
  167. mlrun/serving/v2_serving.py +6 -6
  168. mlrun/utils/helpers.py +23 -42
  169. mlrun/utils/notifications/notification/__init__.py +4 -0
  170. mlrun/utils/notifications/notification/webhook.py +61 -0
  171. mlrun/utils/notifications/notification_pusher.py +5 -25
  172. mlrun/utils/regex.py +7 -2
  173. mlrun/utils/version/version.json +2 -2
  174. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/METADATA +26 -25
  175. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/RECORD +180 -158
  176. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/WHEEL +1 -1
  177. mlrun/mlutils/data.py +0 -160
  178. mlrun/mlutils/models.py +0 -78
  179. mlrun/mlutils/plots.py +0 -902
  180. mlrun/utils/model_monitoring.py +0 -249
  181. /mlrun/{api/db/sqldb/session.py → common/db/sql_session.py} +0 -0
  182. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/LICENSE +0 -0
  183. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/entry_points.txt +0 -0
  184. {mlrun-1.4.0rc25.dist-info → mlrun-1.5.0rc2.dist-info}/top_level.txt +0 -0
mlrun/config.py CHANGED
@@ -27,8 +27,8 @@ import copy
 import json
 import os
 import typing
-import urllib.parse
 from collections.abc import Mapping
+from datetime import timedelta
 from distutils.util import strtobool
 from os.path import expanduser
 from threading import Lock
@@ -149,7 +149,7 @@ default_config = {
         "timeout_mode": "enabled",
         # timeout in seconds to wait for background task to be updated / finished by the worker responsible for the task
         "default_timeouts": {
-            "operations": {"migrations": "3600"},
+            "operations": {"migrations": "3600", "load_project": "60"},
             "runtimes": {"dask": "600"},
         },
     },
@@ -286,6 +286,7 @@ default_config = {
             # - mlrun.runtimes.constants.NuclioIngressAddTemplatedIngressModes
             # - mlrun.runtimes.function.enrich_function_with_ingress
             "add_templated_ingress_host_mode": "never",
+            "explicit_ack": "enabled",
         },
         "logs": {
             "decode": {
@@ -416,7 +417,8 @@ default_config = {
         "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.mlrun.svc.cluster.local:8080",
         "batch_processing_function_branch": "master",
         "parquet_batching_max_events": 10000,
-        # See mlrun.common.schemas.ModelEndpointStoreType for available options
+        "parquet_batching_timeout_secs": timedelta(minutes=30).total_seconds(),
+        # See mlrun.model_monitoring.stores.ModelEndpointStoreType for available options
         "store_type": "v3io-nosql",
         "endpoint_store_connection": "",
     },
@@ -456,7 +458,7 @@ default_config = {
         },
         "default_targets": "parquet,nosql",
         "default_job_image": "mlrun/mlrun",
-        "flush_interval": 300,
+        "flush_interval": None,
     },
     "ui": {
         "projects_prefix": "projects",  # The UI link prefix for projects
@@ -515,7 +517,11 @@ default_config = {
     "debug": {
         "expose_internal_api_endpoints": False,
     },
-    "default_workflow_runner_name": "workflow-runner-{}",
+    "workflows": {
+        "default_workflow_runner_name": "workflow-runner-{}",
+        # Default timeout seconds for retrieving workflow id after execution:
+        "timeouts": {"local": 120, "kfp": 30},
+    },
     "log_collector": {
         "address": "localhost:8282",
         # log collection mode can be one of: "sidecar", "legacy", "best-effort"
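Both of the new blocks above (explicit_ack and the workflows timeouts) follow the standard Config override path, so deployments can tune them without code changes. A minimal sketch, assuming the usual MLRUN_ environment-variable mapping (double underscore per nesting level, JSON-encoded values) and the nesting shown in the hunks; the override values are hypothetical:

    import os

    # Assumption: nested default_config keys map to "MLRUN_"-prefixed env vars,
    # with "__" standing in for each nesting level; values may be JSON-encoded.
    os.environ["MLRUN_HTTPDB__NUCLIO__EXPLICIT_ACK"] = "disabled"
    os.environ["MLRUN_WORKFLOWS__TIMEOUTS"] = '{"local": 300, "kfp": 60}'

    import mlrun  # the config object is populated from the environment lazily

    print(mlrun.mlconf.httpdb.nuclio.explicit_ack)  # expected: "disabled"
    print(mlrun.mlconf.workflows.timeouts.local)  # expected: 300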
@@ -775,7 +781,6 @@ class Config:
         return semver.VersionInfo.parse(f"{semver_compatible_igz_version}.0")

     def verify_security_context_enrichment_mode_is_allowed(self):
-
         # TODO: move SecurityContextEnrichmentModes to a different package so that we could use it here without
         # importing mlrun.api
         if config.function.spec.security_context.enrichment_mode == "disabled":
@@ -932,36 +937,6 @@ class Config:
         # when dbpath is set we want to connect to it which will sync configuration from it to the client
         mlrun.db.get_run_db(value, force_reconnect=True)

-    @property
-    def iguazio_api_url(self):
-        """
-        we want to be able to run with old versions of the service who runs the API (which doesn't configure this
-        value) so we're doing best effort to try and resolve it from other configurations
-        TODO: Remove this hack when 0.6.x is old enough
-        """
-        if not self._iguazio_api_url:
-            if self.httpdb.builder.docker_registry and self.igz_version:
-                return self._extract_iguazio_api_from_docker_registry_url()
-        return self._iguazio_api_url
-
-    def _extract_iguazio_api_from_docker_registry_url(self):
-        docker_registry_url = self.httpdb.builder.docker_registry
-        # add schema otherwise parsing go wrong
-        if "://" not in docker_registry_url:
-            docker_registry_url = f"http://{docker_registry_url}"
-        parsed_registry_url = urllib.parse.urlparse(docker_registry_url)
-        registry_hostname = parsed_registry_url.hostname
-        # replace the first domain section (app service name) with dashboard
-        first_dot_index = registry_hostname.find(".")
-        if first_dot_index < 0:
-            # if not found it's not the format we know - can't resolve the api url from the registry url
-            return ""
-        return f"https://dashboard{registry_hostname[first_dot_index:]}"
-
-    @iguazio_api_url.setter
-    def iguazio_api_url(self, value):
-        self._iguazio_api_url = value
-
     def is_api_running_on_k8s(self):
         # determine if the API service is attached to K8s cluster
         # when there is a cluster the .namespace is set
@@ -1044,6 +1019,40 @@
             ver in mlrun.mlconf.ce.mode for ver in ["lite", "full"]
         )

+    def get_s3_storage_options(self) -> typing.Dict[str, typing.Any]:
+        """
+        Generate storage options dictionary as required for handling S3 path in fsspec. The model monitoring stream
+        graph uses this method for generating the storage options for S3 parquet target path.
+
+        :return: A storage options dictionary in which each key-value pair represents a particular configuration,
+                 such as endpoint_url or aws access key.
+        """
+        key = mlrun.get_secret_or_env("AWS_ACCESS_KEY_ID")
+        secret = mlrun.get_secret_or_env("AWS_SECRET_ACCESS_KEY")
+
+        force_non_anonymous = mlrun.get_secret_or_env("S3_NON_ANONYMOUS")
+        profile = mlrun.get_secret_or_env("AWS_PROFILE")
+
+        storage_options = dict(
+            anon=not (force_non_anonymous or (key and secret)),
+            key=key,
+            secret=secret,
+        )
+
+        endpoint_url = mlrun.get_secret_or_env("S3_ENDPOINT_URL")
+        if endpoint_url:
+            client_kwargs = {"endpoint_url": endpoint_url}
+            storage_options["client_kwargs"] = client_kwargs
+
+        if profile:
+            storage_options["profile"] = profile
+
+        return storage_options
+
+    def is_explicit_ack(self) -> bool:
+        return self.httpdb.nuclio.explicit_ack == "enabled" and (
+            not self.nuclio_version or self.nuclio_version >= "1.11.20"
+        )
+

 # Global configuration
 config = Config.from_dict(default_config)
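The dictionary returned by get_s3_storage_options is shaped for fsspec's s3fs backend, so it can be passed straight through as storage_options. A usage sketch, assuming the relevant credentials are present as env vars or secrets; the bucket and key are hypothetical:

    import pandas as pd

    import mlrun

    # Assumption: AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (and optionally
    # AWS_PROFILE, S3_ENDPOINT_URL, S3_NON_ANONYMOUS) are set; the path is made up.
    options = mlrun.mlconf.get_s3_storage_options()
    df = pd.read_parquet(
        "s3://my-bucket/model-endpoints/parquet/endpoint.parquet",
        storage_options=options,
    )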
@@ -1091,12 +1100,6 @@ def _do_populate(env=None, skip_errors=False):
     if data:
         config.update(data, skip_errors=skip_errors)

-    # HACK to enable config property to both have dynamic default and to use the value from dict/env like other
-    # configurations - we just need a key in the dict that is different than the property name, so simply adding prefix
-    # underscore
-    config._cfg["_iguazio_api_url"] = config._cfg["iguazio_api_url"]
-    del config._cfg["iguazio_api_url"]
-
     _validate_config(config)
mlrun/datastore/__init__.py CHANGED
@@ -29,8 +29,12 @@ __all__ = [
     "StreamSource",
     "KafkaSource",
     "RedisStore",
+    "DatabricksFileSystemDisableCache",
+    "DatabricksFileBugFixed",
 ]

+import fsspec
+
 import mlrun.datastore.wasbfs

 from ..platforms.iguazio import (
@@ -42,6 +46,7 @@ from ..platforms.iguazio import (
 from ..utils import logger
 from .base import DataItem
 from .datastore import StoreManager, in_memory_store, uri_to_ipython
+from .dbfs_store import DatabricksFileBugFixed, DatabricksFileSystemDisableCache
 from .s3 import parse_s3_bucket_and_key
 from .sources import (
     BigQuerySource,
@@ -62,6 +67,22 @@ from .utils import parse_kafka_url

 store_manager = StoreManager()

+if hasattr(fsspec, "register_implementation"):
+    fsspec.register_implementation(
+        "dbfs", DatabricksFileSystemDisableCache, clobber=True
+    )
+else:
+    from fsspec.registry import known_implementations
+
+    known_implementations["dbfs"] = {
+        "class": "mlrun.datastore.dbfs_store.DatabricksFileSystemDisableCache",
+        "err": "Please make sure your fsspec version supports dbfs",
+    }
+
+    del known_implementations
+
+del fsspec  # clear the module namespace
+

 def set_in_memory_item(key, value):
     item = store_manager.object(f"memory://{key}")
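Because importing mlrun.datastore registers the cache-disabled class under the dbfs protocol, plain fsspec calls pick it up transparently. A small sketch, assuming a hypothetical Databricks workspace host and a valid token in DATABRICKS_TOKEN:

    import os

    import fsspec

    import mlrun.datastore  # noqa: F401 - side effect: registers "dbfs" in fsspec

    fs = fsspec.filesystem(
        "dbfs",
        instance="adb-1234567890123456.7.azuredatabricks.net",  # hypothetical host
        token=os.environ["DATABRICKS_TOKEN"],
    )
    print(fs.ls("/"))  # listing works since the broken _ls_from_cache is disabled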
mlrun/datastore/base.py CHANGED
@@ -261,7 +261,7 @@ class DataStore:
                 updated_args = [f"{base_path}/{filename}"]
                 updated_args.extend(args[1:])
                 dfs.append(df_module.read_csv(*updated_args, **kwargs))
-            return pd.concat(dfs)
+            return df_module.concat(dfs)

         elif (
             file_url.endswith(".parquet")
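The one-line change matters because this reader dispatches on df_module, which may be dask.dataframe rather than pandas, and pd.concat refuses dask frames. A sketch of the distinction:

    import dask.dataframe as dd
    import pandas as pd

    pdf = pd.DataFrame({"a": [1, 2]})
    ddf = dd.from_pandas(pdf, npartitions=1)

    pd.concat([pdf, pdf])  # fine: pandas frames
    dd.concat([ddf, ddf])  # fine: dask frames, combined lazily
    # pd.concat([ddf, ddf]) raises TypeError - pandas rejects non-pandas objects,
    # hence the switch to df_module.concat.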
mlrun/datastore/datastore.py CHANGED
@@ -86,6 +86,10 @@ def schema_to_store(schema):
             "Google cloud storage packages are missing, use pip install mlrun[google-cloud-storage]"
         )
         return GoogleCloudStorageStore
+    elif schema == "dbfs":
+        from .dbfs_store import DBFSStore
+
+        return DBFSStore
     else:
         raise ValueError(f"unsupported store scheme ({schema})")

@@ -175,6 +179,11 @@ class StoreManager:
         )

         store, subpath = self.get_or_create_store(url, secrets=secrets)
+        schema, endpoint, parsed_url = parse_url(url)
+        # TODO: Modify the URL replacement to be outside of the dataitem. Dataitem class should
+        # be implemented as a generic class.
+        if endpoint and schema == "dbfs":
+            url = url.replace(endpoint, "", 1)
         return DataItem(key, store, subpath, url, meta=meta, artifact_url=artifact_url)

     def get_or_create_store(self, url, secrets: dict = None) -> (DataStore, str):
mlrun/datastore/dbfs_store.py ADDED
@@ -0,0 +1,168 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pathlib
+
+import fsspec
+from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem
+
+import mlrun.errors
+
+from .base import DataStore, FileStats
+
+
+class DatabricksFileBugFixed(DatabricksFile):
+    """Overrides DatabricksFile to add the following fix: https://github.com/fsspec/filesystem_spec/pull/1278"""
+
+    def _upload_chunk(self, final=False):
+        """Internal function to add a chunk of data to a started upload"""
+        self.buffer.seek(0)
+        data = self.buffer.getvalue()
+
+        data_chunks = [
+            data[start:end] for start, end in self._to_sized_blocks(end=len(data))
+        ]
+
+        for data_chunk in data_chunks:
+            self.fs._add_data(handle=self.handle, data=data_chunk)
+
+        if final:
+            self.fs._close_handle(handle=self.handle)
+            return True
+
+    def _fetch_range(self, start, end):
+        """Internal function to download a block of data"""
+        return_buffer = b""
+        for chunk_start, chunk_end in self._to_sized_blocks(start, end):
+            return_buffer += self.fs._get_data(
+                path=self.path, start=chunk_start, end=chunk_end
+            )
+
+        return return_buffer
+
+    def _to_sized_blocks(self, start=0, end=100):
+        """Helper function to split a range from 0 to total_length into blocksizes"""
+        for data_chunk in range(start, end, self.blocksize):
+            data_start = data_chunk
+            data_end = min(end, data_chunk + self.blocksize)
+            yield data_start, data_end
+
+
+class DatabricksFileSystemDisableCache(DatabricksFileSystem):
+    root_marker = "/"
+    protocol = "dbfs"
+
+    def _open(self, path, mode="rb", block_size="default", **kwargs):
+        """
+        Overwrite the base class method to make sure to create a DBFile.
+        All arguments are copied from the base method.
+
+        Only the default blocksize is allowed.
+        """
+        return DatabricksFileBugFixed(
+            self, path, mode=mode, block_size=block_size, **kwargs
+        )
+
+    # _ls_from_cache is not working properly, so we disable it.
+    def _ls_from_cache(self, path):
+        pass
+
+
+# dbfs objects will be represented with the following URL: dbfs://<path>
+class DBFSStore(DataStore):
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self.get_filesystem(silent=False)
+
+    def get_filesystem(self, silent=True):
+        """return fsspec file system object, if supported"""
+        if not self._filesystem:
+            self._filesystem = fsspec.filesystem("dbfs", **self.get_storage_options())
+        return self._filesystem
+
+    def get_storage_options(self):
+        return dict(
+            token=self._get_secret_or_env("DATABRICKS_TOKEN"), instance=self.endpoint
+        )
+
+    def _verify_filesystem_and_key(self, key: str):
+        if not self._filesystem:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Performing actions on data-item without a valid filesystem"
+            )
+        if not key.startswith("/"):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid key parameter - key must start with '/'"
+            )
+
+    def get(self, key: str, size=None, offset=0) -> bytes:
+        self._verify_filesystem_and_key(key)
+        if size is not None and size <= 0:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "size cannot be negative or zero"
+            )
+        start = offset or None
+        end = offset + size if size is not None else None
+        return self._filesystem.cat_file(key, start=start, end=end)
+
+    def put(self, key, data, append=False):
+        self._verify_filesystem_and_key(key)
+        if append:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Append mode not supported for Databricks file system"
+            )
+        # can not use append mode because it overrides data.
+        mode = "w"
+        if isinstance(data, bytes):
+            mode += "b"
+        elif not isinstance(data, str):
+            raise TypeError(f"Unknown data type {type(data)}")
+        with self._filesystem.open(key, mode) as f:
+            f.write(data)
+
+    def upload(self, key: str, src_path: str):
+        self._verify_filesystem_and_key(key)
+        self._filesystem.put_file(src_path, key, overwrite=True)
+
+    def stat(self, key: str):
+        self._verify_filesystem_and_key(key)
+        file = self._filesystem.stat(key)
+        if file["type"] == "file":
+            size = file["size"]
+        elif file["type"] == "directory":
+            raise FileNotFoundError("Operation expects a file not a directory!")
+        return FileStats(size, None)
+
+    def listdir(self, key: str):
+        """
+        Basic ls of file/dir - without recursion.
+        """
+        self._verify_filesystem_and_key(key)
+        if self._filesystem.isfile(key):
+            return key
+        remote_path = f"{key}/*"
+        files = self._filesystem.glob(remote_path)
+        # Get only the files and directories under key path, without the key path itself.
+        # for example in a filesystem that has this path: /test_mlrun_dbfs_objects/test.txt
+        # listdir with the input /test_mlrun_dbfs_objects as a key will return ['test.txt'].
+        files = [pathlib.Path(file).name for file in files if "/" in file]
+        return files
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        if maxdepth:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "dbfs file system does not support maxdepth option in rm function"
+            )
+        self.get_filesystem().rm(path=path, recursive=recursive)
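Combined with the StoreManager change above (which strips the endpoint out of the URL before it becomes the item key), a dbfs data item could be used as follows; the workspace host and path are hypothetical, and DATABRICKS_TOKEN is assumed to be available as an env var or secret:

    import os

    import mlrun

    os.environ["DATABRICKS_TOKEN"] = "<personal-access-token>"  # placeholder

    # The endpoint is embedded in the URL as dbfs://<host>/<path>, so DBFSStore
    # receives a key that starts with "/" after the endpoint is stripped.
    item = mlrun.get_dataitem(
        "dbfs://adb-1234567890123456.7.azuredatabricks.net/test_mlrun_dbfs_objects/test.txt"
    )
    print(item.get().decode())  # DBFSStore.get -> fsspec cat_file under the hood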
mlrun/datastore/helpers.py ADDED
@@ -0,0 +1,18 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+ONE_GB = 1024 * 1024 * 1024
+ONE_MB = 1024 * 1024
mlrun/datastore/sources.py CHANGED
@@ -793,6 +793,7 @@ class OnlineSource(BaseSourceDriver):
             context=context,
             key_field=self.key_field,
             full_event=True,
+            explicit_ack=mlrun.mlconf.is_explicit_ack(),
             **source_args,
         )
mlrun/datastore/store_resources.py CHANGED
@@ -16,12 +16,9 @@

 import mlrun
 from mlrun.config import config
-from mlrun.utils.helpers import (
-    is_legacy_artifact,
-    parse_artifact_uri,
-    parse_versioned_object_uri,
-)
+from mlrun.utils.helpers import is_legacy_artifact, parse_artifact_uri

+from ..common.helpers import parse_versioned_object_uri
 from ..platforms.iguazio import parse_path
 from ..utils import DB_SCHEMA, StorePrefix
 from .targets import get_online_target
mlrun/datastore/v3io.py CHANGED
@@ -22,6 +22,7 @@ import fsspec
 import v3io.dataplane

 import mlrun
+from mlrun.datastore.helpers import ONE_GB, ONE_MB

 from ..platforms.iguazio import parse_path, split_path
 from .base import (
@@ -36,8 +37,6 @@ from .base import (
 )

 V3IO_LOCAL_ROOT = "v3io"
-ONE_GB = 1024 * 1024 * 1024
-ONE_MB = 1024 * 1024


 class V3ioStore(DataStore):
mlrun/db/__init__.py CHANGED
@@ -12,14 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from os import environ
-from urllib.parse import urlparse

 from ..config import config
-from ..platforms import add_or_refresh_credentials
-from ..utils import logger
 from .base import RunDBError, RunDBInterface  # noqa
-from .nopdb import NopDB
-from .sqldb import SQLDB


 def get_or_set_dburl(default=""):
@@ -29,69 +24,10 @@ def get_or_set_dburl(default=""):
     return config.dbpath


-def get_httpdb_kwargs(host, username, password):
-    username = username or config.httpdb.user
-    password = password or config.httpdb.password
-
-    username, password, token = add_or_refresh_credentials(
-        host, username, password, config.httpdb.token
-    )
-
-    return {
-        "user": username,
-        "password": password,
-        "token": token,
-    }
-
-
-_run_db = None
-_last_db_url = None
-
-
 def get_run_db(url="", secrets=None, force_reconnect=False):
     """Returns the runtime database"""
-    global _run_db, _last_db_url
-
-    if not url:
-        url = get_or_set_dburl("./")
-
-    if (
-        _last_db_url is not None
-        and url == _last_db_url
-        and _run_db
-        and not force_reconnect
-    ):
-        return _run_db
-    _last_db_url = url
-
-    parsed_url = urlparse(url)
-    scheme = parsed_url.scheme.lower()
-    kwargs = {}
-    if "://" not in str(url) or scheme in ["file", "s3", "v3io", "v3ios"]:
-        logger.warning(
-            "Could not detect path to API server, not connected to API server!"
-        )
-        logger.warning(
-            "MLRUN_DBPATH is not set. Set this environment variable to the URL of the API server"
-            " in order to connect"
-        )
-        cls = NopDB
-
-    elif scheme in ("http", "https"):
-        # import here to avoid circular imports
-        from .httpdb import HTTPRunDB
-
-        cls = HTTPRunDB
-        kwargs = get_httpdb_kwargs(
-            parsed_url.hostname, parsed_url.username, parsed_url.password
-        )
-        endpoint = parsed_url.hostname
-        if parsed_url.port:
-            endpoint += f":{parsed_url.port}"
-        url = f"{parsed_url.scheme}://{endpoint}{parsed_url.path}"
-    else:
-        cls = SQLDB
+    # import here to avoid circular import
+    import mlrun.db.factory

-    _run_db = cls(url, **kwargs)
-    _run_db.connect(secrets=secrets)
-    return _run_db
+    run_db_factory = mlrun.db.factory.RunDBFactory()
+    return run_db_factory.create_run_db(url, secrets, force_reconnect)
mlrun/db/base.py CHANGED
@@ -621,3 +621,15 @@ class RunDBInterface(ABC):
         notifications: typing.List[mlrun.model.Notification],
     ):
         pass
+
+    def store_run_notifications(
+        self,
+        notification_objects: typing.List[mlrun.model.Notification],
+        run_uid: str,
+        project: str = None,
+        mask_params: bool = True,
+    ):
+        pass
+
+    def watch_log(self, uid, project="", watch=True, offset=0):
+        pass
mlrun/db/factory.py ADDED
@@ -0,0 +1,65 @@
+# Copyright 2023 MLRun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dependency_injector import containers, providers
+
+import mlrun.db
+import mlrun.db.httpdb
+import mlrun.db.nopdb
+import mlrun.utils.singleton
+from mlrun.utils import logger
+
+
+class RunDBFactory(
+    metaclass=mlrun.utils.singleton.AbstractSingleton,
+):
+    def __init__(self):
+        self._run_db = None
+        self._last_db_url = None
+        self._rundb_container = RunDBContainer()
+
+    def create_run_db(self, url="", secrets=None, force_reconnect=False):
+        """Returns the runtime database"""
+        if not url:
+            url = mlrun.db.get_or_set_dburl("./")
+
+        if (
+            self._last_db_url is not None
+            and url == self._last_db_url
+            and self._run_db
+            and not force_reconnect
+        ):
+            return self._run_db
+
+        self._last_db_url = url
+
+        if "://" not in str(url):
+            logger.warning(
+                "Could not detect path to API server, not connected to API server!"
+            )
+            logger.warning(
+                "MLRUN_DBPATH is misconfigured. Set this environment variable to the URL of the API server"
+                " in order to connect"
+            )
+            self._run_db = self._rundb_container.nop(url)
+
+        else:
+            self._run_db = self._rundb_container.run_db(url)
+
+        self._run_db.connect(secrets=secrets)
+        return self._run_db
+
+
+class RunDBContainer(containers.DeclarativeContainer):
+    nop = providers.Factory(mlrun.db.nopdb.NopDB)
+    run_db = providers.Factory(mlrun.db.httpdb.HTTPRunDB)
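With the factory in place, mlrun.db.get_run_db becomes a thin delegating wrapper: a scheme-less URL yields the NopDB stub (plus the warnings above), while any URL containing "://" is routed to HTTPRunDB, since the SQL path moved to mlrun.api.rundb. A short sketch; the API address is hypothetical:

    import mlrun.db

    # Scheme-less URL -> NopDB stub and the MLRUN_DBPATH warnings.
    offline_db = mlrun.db.get_run_db("./")

    # Hypothetical in-cluster API address; any "://" URL is handed to HTTPRunDB,
    # which connects (and syncs config) inside create_run_db.
    api_db = mlrun.db.get_run_db("http://mlrun-api:8080", force_reconnect=True)
    print(type(api_db).__name__)  # HTTPRunDB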