mlrun-1.7.2rc3-py3-none-any.whl → mlrun-1.8.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (222)
  1. mlrun/__init__.py +14 -12
  2. mlrun/__main__.py +3 -3
  3. mlrun/alerts/alert.py +19 -12
  4. mlrun/artifacts/__init__.py +0 -2
  5. mlrun/artifacts/base.py +34 -11
  6. mlrun/artifacts/dataset.py +16 -16
  7. mlrun/artifacts/manager.py +13 -13
  8. mlrun/artifacts/model.py +66 -53
  9. mlrun/common/constants.py +6 -0
  10. mlrun/common/formatters/__init__.py +1 -0
  11. mlrun/common/formatters/feature_set.py +1 -0
  12. mlrun/common/formatters/function.py +1 -0
  13. mlrun/common/formatters/model_endpoint.py +30 -0
  14. mlrun/common/formatters/pipeline.py +1 -2
  15. mlrun/common/model_monitoring/__init__.py +0 -3
  16. mlrun/common/model_monitoring/helpers.py +1 -1
  17. mlrun/common/runtimes/constants.py +1 -2
  18. mlrun/common/schemas/__init__.py +4 -2
  19. mlrun/common/schemas/artifact.py +0 -6
  20. mlrun/common/schemas/common.py +50 -0
  21. mlrun/common/schemas/model_monitoring/__init__.py +8 -1
  22. mlrun/common/schemas/model_monitoring/constants.py +62 -12
  23. mlrun/common/schemas/model_monitoring/model_endpoint_v2.py +149 -0
  24. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -5
  25. mlrun/common/schemas/partition.py +122 -0
  26. mlrun/config.py +43 -15
  27. mlrun/data_types/__init__.py +0 -2
  28. mlrun/data_types/data_types.py +0 -1
  29. mlrun/data_types/infer.py +3 -1
  30. mlrun/data_types/spark.py +4 -4
  31. mlrun/data_types/to_pandas.py +2 -11
  32. mlrun/datastore/__init__.py +0 -2
  33. mlrun/datastore/alibaba_oss.py +4 -1
  34. mlrun/datastore/azure_blob.py +4 -1
  35. mlrun/datastore/base.py +12 -4
  36. mlrun/datastore/datastore.py +9 -3
  37. mlrun/datastore/datastore_profile.py +1 -1
  38. mlrun/datastore/dbfs_store.py +4 -1
  39. mlrun/datastore/filestore.py +4 -1
  40. mlrun/datastore/google_cloud_storage.py +4 -1
  41. mlrun/datastore/hdfs.py +4 -1
  42. mlrun/datastore/inmem.py +4 -1
  43. mlrun/datastore/redis.py +4 -1
  44. mlrun/datastore/s3.py +4 -1
  45. mlrun/datastore/sources.py +51 -49
  46. mlrun/datastore/store_resources.py +0 -2
  47. mlrun/datastore/targets.py +22 -23
  48. mlrun/datastore/utils.py +2 -2
  49. mlrun/datastore/v3io.py +4 -1
  50. mlrun/datastore/wasbfs/fs.py +13 -12
  51. mlrun/db/base.py +126 -62
  52. mlrun/db/factory.py +3 -0
  53. mlrun/db/httpdb.py +767 -231
  54. mlrun/db/nopdb.py +126 -57
  55. mlrun/errors.py +2 -2
  56. mlrun/execution.py +55 -29
  57. mlrun/feature_store/__init__.py +0 -2
  58. mlrun/feature_store/api.py +40 -40
  59. mlrun/feature_store/common.py +9 -9
  60. mlrun/feature_store/feature_set.py +20 -18
  61. mlrun/feature_store/feature_vector.py +27 -24
  62. mlrun/feature_store/retrieval/base.py +14 -9
  63. mlrun/feature_store/retrieval/job.py +2 -1
  64. mlrun/feature_store/steps.py +2 -2
  65. mlrun/features.py +30 -13
  66. mlrun/frameworks/__init__.py +1 -2
  67. mlrun/frameworks/_common/__init__.py +1 -2
  68. mlrun/frameworks/_common/artifacts_library.py +2 -2
  69. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  70. mlrun/frameworks/_common/model_handler.py +29 -27
  71. mlrun/frameworks/_common/producer.py +3 -1
  72. mlrun/frameworks/_dl_common/__init__.py +1 -2
  73. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  74. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  75. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  76. mlrun/frameworks/_ml_common/__init__.py +1 -2
  77. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  78. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  79. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  80. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  81. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  82. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  83. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  84. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  85. mlrun/frameworks/huggingface/__init__.py +1 -2
  86. mlrun/frameworks/huggingface/model_server.py +9 -9
  87. mlrun/frameworks/lgbm/__init__.py +47 -44
  88. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  89. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  90. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  91. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  92. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  93. mlrun/frameworks/lgbm/model_handler.py +15 -11
  94. mlrun/frameworks/lgbm/model_server.py +11 -7
  95. mlrun/frameworks/lgbm/utils.py +2 -2
  96. mlrun/frameworks/onnx/__init__.py +1 -2
  97. mlrun/frameworks/onnx/dataset.py +3 -3
  98. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  99. mlrun/frameworks/onnx/model_handler.py +7 -5
  100. mlrun/frameworks/onnx/model_server.py +8 -6
  101. mlrun/frameworks/parallel_coordinates.py +11 -11
  102. mlrun/frameworks/pytorch/__init__.py +22 -23
  103. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  104. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  105. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  106. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  107. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  108. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  109. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  110. mlrun/frameworks/pytorch/model_handler.py +21 -17
  111. mlrun/frameworks/pytorch/model_server.py +13 -9
  112. mlrun/frameworks/sklearn/__init__.py +19 -18
  113. mlrun/frameworks/sklearn/estimator.py +2 -2
  114. mlrun/frameworks/sklearn/metric.py +3 -3
  115. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  116. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  117. mlrun/frameworks/sklearn/model_handler.py +4 -3
  118. mlrun/frameworks/tf_keras/__init__.py +11 -12
  119. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  120. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  121. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  122. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  123. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  124. mlrun/frameworks/tf_keras/model_server.py +12 -8
  125. mlrun/frameworks/xgboost/__init__.py +19 -18
  126. mlrun/frameworks/xgboost/model_handler.py +13 -9
  127. mlrun/launcher/base.py +3 -4
  128. mlrun/launcher/local.py +1 -1
  129. mlrun/launcher/remote.py +1 -1
  130. mlrun/lists.py +4 -3
  131. mlrun/model.py +108 -44
  132. mlrun/model_monitoring/__init__.py +1 -2
  133. mlrun/model_monitoring/api.py +6 -6
  134. mlrun/model_monitoring/applications/_application_steps.py +13 -15
  135. mlrun/model_monitoring/applications/histogram_data_drift.py +41 -15
  136. mlrun/model_monitoring/applications/results.py +55 -3
  137. mlrun/model_monitoring/controller.py +185 -223
  138. mlrun/model_monitoring/db/_schedules.py +156 -0
  139. mlrun/model_monitoring/db/_stats.py +189 -0
  140. mlrun/model_monitoring/db/stores/__init__.py +1 -1
  141. mlrun/model_monitoring/db/stores/base/store.py +6 -65
  142. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -25
  143. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -97
  144. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +2 -58
  145. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -15
  146. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +6 -257
  147. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +9 -271
  148. mlrun/model_monitoring/db/tsdb/base.py +74 -22
  149. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +66 -35
  150. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  151. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +284 -51
  152. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  153. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +35 -17
  154. mlrun/model_monitoring/helpers.py +97 -1
  155. mlrun/model_monitoring/model_endpoint.py +4 -2
  156. mlrun/model_monitoring/stream_processing.py +2 -2
  157. mlrun/model_monitoring/tracking_policy.py +10 -3
  158. mlrun/model_monitoring/writer.py +47 -26
  159. mlrun/package/__init__.py +3 -6
  160. mlrun/package/context_handler.py +1 -1
  161. mlrun/package/packager.py +12 -9
  162. mlrun/package/packagers/__init__.py +0 -2
  163. mlrun/package/packagers/default_packager.py +14 -11
  164. mlrun/package/packagers/numpy_packagers.py +16 -7
  165. mlrun/package/packagers/pandas_packagers.py +18 -18
  166. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  167. mlrun/package/packagers_manager.py +31 -14
  168. mlrun/package/utils/__init__.py +0 -3
  169. mlrun/package/utils/_pickler.py +6 -6
  170. mlrun/platforms/__init__.py +3 -3
  171. mlrun/platforms/iguazio.py +4 -1
  172. mlrun/projects/__init__.py +1 -6
  173. mlrun/projects/operations.py +27 -27
  174. mlrun/projects/pipelines.py +85 -215
  175. mlrun/projects/project.py +444 -158
  176. mlrun/run.py +9 -9
  177. mlrun/runtimes/__init__.py +1 -3
  178. mlrun/runtimes/base.py +13 -10
  179. mlrun/runtimes/daskjob.py +9 -9
  180. mlrun/runtimes/generators.py +2 -1
  181. mlrun/runtimes/kubejob.py +4 -5
  182. mlrun/runtimes/mpijob/__init__.py +0 -2
  183. mlrun/runtimes/mpijob/abstract.py +7 -6
  184. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  185. mlrun/runtimes/nuclio/application/application.py +11 -11
  186. mlrun/runtimes/nuclio/function.py +14 -13
  187. mlrun/runtimes/nuclio/serving.py +9 -9
  188. mlrun/runtimes/pod.py +74 -29
  189. mlrun/runtimes/remotesparkjob.py +3 -2
  190. mlrun/runtimes/sparkjob/__init__.py +0 -2
  191. mlrun/runtimes/sparkjob/spark3job.py +21 -11
  192. mlrun/runtimes/utils.py +6 -5
  193. mlrun/serving/merger.py +6 -4
  194. mlrun/serving/remote.py +18 -17
  195. mlrun/serving/routers.py +27 -27
  196. mlrun/serving/server.py +1 -1
  197. mlrun/serving/states.py +76 -71
  198. mlrun/serving/utils.py +13 -2
  199. mlrun/serving/v1_serving.py +3 -2
  200. mlrun/serving/v2_serving.py +4 -4
  201. mlrun/track/__init__.py +1 -1
  202. mlrun/track/tracker.py +2 -2
  203. mlrun/track/trackers/mlflow_tracker.py +6 -5
  204. mlrun/utils/async_http.py +1 -1
  205. mlrun/utils/helpers.py +72 -28
  206. mlrun/utils/logger.py +104 -2
  207. mlrun/utils/notifications/notification/base.py +23 -4
  208. mlrun/utils/notifications/notification/console.py +1 -1
  209. mlrun/utils/notifications/notification/git.py +6 -6
  210. mlrun/utils/notifications/notification/ipython.py +5 -4
  211. mlrun/utils/notifications/notification/slack.py +1 -1
  212. mlrun/utils/notifications/notification/webhook.py +13 -17
  213. mlrun/utils/notifications/notification_pusher.py +23 -19
  214. mlrun/utils/regex.py +1 -1
  215. mlrun/utils/version/version.json +2 -2
  216. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/METADATA +186 -186
  217. mlrun-1.8.0rc1.dist-info/RECORD +356 -0
  218. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/WHEEL +1 -1
  219. mlrun-1.7.2rc3.dist-info/RECORD +0 -351
  220. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/LICENSE +0 -0
  221. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/entry_points.txt +0 -0
  222. {mlrun-1.7.2rc3.dist-info → mlrun-1.8.0rc1.dist-info}/top_level.txt +0 -0
mlrun/config.py CHANGED
@@ -102,6 +102,9 @@ default_config = {
     "log_level": "INFO",
     # log formatter (options: human | human_extended | json)
     "log_formatter": "human",
+    # custom logger format, works only with log_formatter: custom
+    # Note that your custom format must include those 4 fields - timestamp, level, message and more
+    "log_format_override": None,
     "submit_timeout": "180",  # timeout when submitting a new k8s resource
     # runtimes cleanup interval in seconds
     "runtimes_cleanup_interval": "300",
@@ -120,14 +123,6 @@ default_config = {
         "projects": {
             "summaries": {
                 "cache_interval": "30",
-                "feature_gates": {
-                    "artifacts": "enabled",
-                    "schedules": "enabled",
-                    "feature_sets": "enabled",
-                    "models": "enabled",
-                    "runs": "enabled",
-                    "pipelines": "enabled",
-                },
             },
         },
     },
@@ -140,6 +135,9 @@ default_config = {
             "delete_crd_resources_timeout": "5 minutes",
         },
     },
+    "object_retentions": {
+        "alert_activation": 14 * 7,  # days
+    },
     # the grace period (in seconds) that will be given to runtime resources (after they're in terminal state)
     # before deleting them (4 hours)
     "runtime_resources_deletion_grace_period": "14400",
@@ -314,7 +312,7 @@ default_config = {
             },
             "request_timeout": 45,  # seconds
         },
-        # see server.api.utils.helpers.ensure_running_on_chief
+        # see server.py.services.api.utils.helpers.ensure_running_on_chief
         "ensure_function_running_on_chief_mode": "enabled",
     },
     "port": 8080,
@@ -794,7 +792,7 @@ default_config = {
     "grafana_url": "",
     "alerts": {
         # supported modes: "enabled", "disabled".
-        "mode": "disabled",
+        "mode": "enabled",
         # maximum number of alerts we allow to be configured.
         # user will get an error when exceeding this
         "max_allowed": 10000,
@@ -851,6 +849,22 @@ class Config:
         name = self.__class__.__name__
         return f"{name}({self._cfg!r})"

+    def __iter__(self):
+        if isinstance(self._cfg, Mapping):
+            return self._cfg.__iter__()
+
+    def items(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self._cfg.items())
+
+    def keys(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.keys())
+
+    def values(self):
+        if isinstance(self._cfg, Mapping):
+            return iter(self.data.values())
+
     def update(self, cfg, skip_errors=False):
         for key, value in cfg.items():
             if hasattr(self, key):
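These additions let a dict-backed Config node be consumed as a read-only mapping. A rough usage sketch, assuming the accessed node wraps a plain dict (note the upstream asymmetry visible above: items() reads self._cfg while keys()/values() go through self.data):

    import mlrun

    # __iter__ delegates to the backing dict, so iteration now works directly.
    for key in mlrun.mlconf.httpdb:
        print(key)

    # items() yields the raw (key, value) pairs of the underlying dict.
    for key, value in mlrun.mlconf.httpdb.items():
        print(key, value)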
@@ -1043,6 +1057,17 @@ class Config:
                 f"is not allowed for iguazio version: {igz_version} < 3.5.1"
             )

+    def validate_object_retentions(self):
+        for table_name, retention_days in self.object_retentions.items():
+            if retention_days < 7 and not os.getenv("PARTITION_INTERVAL"):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be greater than a week"
+                )
+            elif retention_days > 53 * 7:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"{table_name} partition interval must be less than a year"
+                )
+
     def resolve_chief_api_url(self) -> str:
         if self.httpdb.clusterization.chief.url:
             return self.httpdb.clusterization.chief.url
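The shipped default stays inside these bounds: alert_activation is 14 * 7 = 98 days, and the check accepts 7 <= days <= 53 * 7 = 371, with the lower bound waived when the PARTITION_INTERVAL environment variable is set (note the error messages say "partition interval" although the config key stores retention days). A standalone sketch of the same arithmetic:

    # Reproduce the bounds applied by validate_object_retentions.
    retention_days = 14 * 7          # 98 days, the alert_activation default
    assert retention_days >= 7       # else: "must be greater than a week"
    assert retention_days <= 53 * 7  # else: "must be less than a year" (371 days)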
@@ -1201,9 +1226,9 @@

     def get_model_monitoring_file_target_path(
         self,
-        project: str = "",
-        kind: str = "",
-        target: str = "online",
+        project: str,
+        kind: str,
+        target: typing.Literal["online", "offline"] = "online",
         artifact_path: typing.Optional[str] = None,
         function_name: typing.Optional[str] = None,
         **kwargs,
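The Literal annotation narrows target to the two supported values, so static checkers reject anything else, and project and kind lose their empty-string defaults. A hedged call sketch (the argument values are illustrative, not taken from the diff):

    import mlrun

    path = mlrun.mlconf.get_model_monitoring_file_target_path(
        project="my-project",  # now required
        kind="events",         # now required; value is illustrative
        target="offline",      # a type checker rejects values outside the Literal
    )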
@@ -1381,9 +1406,12 @@ def _validate_config(config):
         pass

     config.verify_security_context_enrichment_mode_is_allowed()
+    config.validate_object_retentions()


-def _verify_gpu_requests_and_limits(requests_gpu: str = None, limits_gpu: str = None):
+def _verify_gpu_requests_and_limits(
+    requests_gpu: typing.Optional[str] = None, limits_gpu: typing.Optional[str] = None
+):
     # https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
     if requests_gpu and not limits_gpu:
         raise mlrun.errors.MLRunConflictError(
@@ -1396,7 +1424,7 @@ def _verify_gpu_requests_and_limits(requests_gpu: str = None, limits_gpu: str =
     )


-def _convert_resources_to_str(config: dict = None):
+def _convert_resources_to_str(config: typing.Optional[dict] = None):
     resources_types = ["cpu", "memory", "gpu"]
     resource_requirements = ["requests", "limits"]
     if not config.get("default_function_pod_resources"):
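The GPU helper encodes the Kubernetes rule linked in the code: GPUs cannot be overcommitted, so a GPU request without a matching limit is invalid. A simplified sketch of the first guard only (not the module's actual helper):

    import mlrun.errors

    def verify_gpu(requests_gpu=None, limits_gpu=None):
        # Same guard as _verify_gpu_requests_and_limits above, reduced to one case.
        if requests_gpu and not limits_gpu:
            raise mlrun.errors.MLRunConflictError(
                "GPU requests must be accompanied by GPU limits"
            )

    verify_gpu(limits_gpu="1")      # allowed: a limit without a request
    # verify_gpu(requests_gpu="1")  # would raise MLRunConflictError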
mlrun/data_types/__init__.py CHANGED
@@ -11,8 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx

 from .data_types import (
     InferOptions,
mlrun/data_types/data_types.py CHANGED
@@ -124,7 +124,6 @@ def spark_to_value_type(data_type):
         "double": ValueType.DOUBLE,
         "boolean": ValueType.BOOL,
         "timestamp": ValueType.DATETIME,
-        "timestamp_ntz": ValueType.DATETIME,
         "string": ValueType.STRING,
         "array": "list",
         "map": "dict",
mlrun/data_types/infer.py CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from typing import Optional
+
 import numpy as np
 import packaging.version
 import pandas as pd
@@ -29,7 +31,7 @@ def infer_schema_from_df(
     df: pd.DataFrame,
     features,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
 ):
mlrun/data_types/spark.py CHANGED
@@ -14,11 +14,12 @@
 #
 from datetime import datetime
 from os import environ
+from typing import Optional

 import numpy as np
 import pytz
 from pyspark.sql.functions import to_utc_timestamp
-from pyspark.sql.types import BooleanType, DoubleType
+from pyspark.sql.types import BooleanType, DoubleType, TimestampType

 from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
@@ -35,7 +36,7 @@ def infer_schema_from_df_spark(
     df,
     features,
     entities,
-    timestamp_key: str = None,
+    timestamp_key: Optional[str] = None,
     entity_columns=None,
     options: InferOptions = InferOptions.Null,
 ):
@@ -143,8 +144,7 @@ def get_df_stats_spark(df, options, num_bins=20, sample_size=None):
     timestamp_columns = set()
     boolean_columns = set()
     for field in df_after_type_casts.schema.fields:
-        # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
-        is_timestamp = field.dataType.typeName().startswith("timestamp")
+        is_timestamp = isinstance(field.dataType, TimestampType)
         is_boolean = isinstance(field.dataType, BooleanType)
         if is_timestamp:
             df_after_type_casts = df_after_type_casts.withColumn(
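One behavioral nuance worth noting: the removed check, typeName().startswith("timestamp"), matched both TimestampType and TimestampNTZType (whose typeName is "timestamp_ntz"), while isinstance matches only TimestampType, because the two are sibling classes in PySpark rather than subclasses. This lines up with the timestamp_ntz mapping dropped from data_types.py above. A quick demonstration:

    from pyspark.sql.types import TimestampNTZType, TimestampType  # NTZ needs PySpark >= 3.4

    ntz = TimestampNTZType()
    print(ntz.typeName().startswith("timestamp"))  # True  - the old check matched it
    print(isinstance(ntz, TimestampType))          # False - the new check excludes it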
mlrun/data_types/to_pandas.py CHANGED
@@ -244,15 +244,6 @@ def _to_corrected_pandas_type(dt):


 def spark_df_to_pandas(spark_df):
-    import pyspark
-
-    if semver.parse(pyspark.__version__) >= semver.Version(3, 5, 0):
-
-        def to_pandas(spark_df_inner):
-            return spark_df_inner.toPandas()
-    else:
-        to_pandas = _to_pandas
-
     # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
     # when we upgrade pyspark, we should check whether this workaround is still necessary
     # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
@@ -271,9 +262,9 @@ def spark_df_to_pandas(spark_df):
             )
             type_conversion_dict[field.name] = "datetime64[ns]"

-        df = to_pandas(spark_df)
+        df = _to_pandas(spark_df)
         if type_conversion_dict:
             df = df.astype(type_conversion_dict)
         return df
     else:
-        return to_pandas(spark_df)
+        return _to_pandas(spark_df)
mlrun/datastore/__init__.py CHANGED
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-# flake8: noqa - this is until we take care of the F401 violations with respect to __all__ & sphinx
-
 __all__ = [
     "DataItem",
     "get_store_resource",
mlrun/datastore/alibaba_oss.py CHANGED
@@ -15,6 +15,7 @@
 import time
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 from urllib.parse import urlparse

 import oss2
@@ -28,7 +29,9 @@ from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 class OSSStore(DataStore):
     using_bucket = True

-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         # will be used in case user asks to assume a role and work through fsspec

mlrun/datastore/azure_blob.py CHANGED
@@ -14,6 +14,7 @@

 import time
 from pathlib import Path
+from typing import Optional
 from urllib.parse import urlparse

 from azure.storage.blob import BlobServiceClient
@@ -36,7 +37,9 @@ class AzureBlobStore(DataStore):
         1024 * 1024 * 8
     )  # for service_client property only, does not affect filesystem

-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._service_client = None
         self._storage_options = None
mlrun/datastore/base.py CHANGED
@@ -48,7 +48,7 @@ class FileStats:
 class DataStore:
     using_bucket = False

-    def __init__(self, parent, name, kind, endpoint="", secrets: dict = None):
+    def __init__(self, parent, name, kind, endpoint="", secrets: Optional[dict] = None):
         self._parent = parent
         self.kind = kind
         self.name = name
@@ -500,12 +500,18 @@
         """DataItem url e.g. /dir/path, s3://bucket/path"""
         return self._url

-    def get(self, size=None, offset=0, encoding=None):
+    def get(
+        self,
+        size: Optional[int] = None,
+        offset: int = 0,
+        encoding: Optional[str] = None,
+    ) -> Union[bytes, str]:
         """read all or a byte range and return the content

         :param size: number of bytes to get
         :param offset: fetch from offset (in bytes)
         :param encoding: encoding (e.g. "utf-8") for converting bytes to str
+        :return: the bytes/str content
         """
         body = self._store.get(self._path, size=size, offset=offset)
         if encoding and isinstance(body, bytes):
@@ -519,7 +525,7 @@
         """
         self._store.download(self._path, target_path)

-    def put(self, data, append=False):
+    def put(self, data: Union[bytes, str], append: bool = False) -> None:
         """write/upload the data, append is only supported by some datastores

         :param data: data (bytes/str) to write
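The new annotations document the existing runtime contract: get returns raw bytes unless an encoding is supplied, in which case the body is decoded to str. A small usage sketch (the URL is illustrative):

    import mlrun

    item = mlrun.get_dataitem("s3://my-bucket/path/data.txt")  # illustrative URL
    item.put("hello world")            # accepts str or bytes
    raw = item.get(size=5, offset=0)   # first 5 bytes, returned as bytes
    text = item.get(encoding="utf-8")  # decoded to str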
@@ -687,7 +693,9 @@


 class HttpStore(DataStore):
-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         self._https_auth_token = None
         self._schema = schema
mlrun/datastore/datastore.py CHANGED
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Optional
 from urllib.parse import urlparse

 from mergedeep import merge
@@ -178,12 +179,17 @@
         # which accepts a feature vector uri and generate the offline vector (parquet) for it if it doesnt exist
         if not target and not allow_empty_resources:
             raise mlrun.errors.MLRunInvalidArgumentError(
-                f"resource {url} does not have a valid/persistent offline target"
+                f"Resource {url} does not have a valid/persistent offline target"
             )
         return resource, target or ""

     def object(
-        self, url, key="", project="", allow_empty_resources=None, secrets: dict = None
+        self,
+        url,
+        key="",
+        project="",
+        allow_empty_resources=None,
+        secrets: Optional[dict] = None,
     ) -> DataItem:
         meta = artifact_url = None
         if is_store_uri(url):
@@ -205,7 +211,7 @@
         )

     def get_or_create_store(
-        self, url, secrets: dict = None, project_name=""
+        self, url, secrets: Optional[dict] = None, project_name=""
     ) -> (DataStore, str, str):
         schema, endpoint, parsed_url = parse_url(url)
         subpath = parsed_url.path
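The reflowed object signature keeps the call keyword-friendly while making the implicit-None default explicit. A hedged sketch of passing per-call credentials through the module-level store manager (the credential keys shown are ones the S3 store reads from secrets or the environment; the values are placeholders):

    import mlrun.datastore

    item = mlrun.datastore.store_manager.object(
        url="s3://my-bucket/file.csv",  # illustrative URL
        secrets={
            "AWS_ACCESS_KEY_ID": "...",      # placeholder credentials
            "AWS_SECRET_ACCESS_KEY": "...",
        },
    )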
mlrun/datastore/datastore_profile.py CHANGED
@@ -489,7 +489,7 @@ class DatastoreProfile2Json(pydantic.BaseModel):
     )


-def datastore_profile_read(url, project_name="", secrets: dict = None):
+def datastore_profile_read(url, project_name="", secrets: typing.Optional[dict] = None):
     parsed_url = urlparse(url)
     if parsed_url.scheme.lower() != "ds":
         raise mlrun.errors.MLRunInvalidArgumentError(
mlrun/datastore/dbfs_store.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import pathlib
+from typing import Optional

 from fsspec.implementations.dbfs import DatabricksFile, DatabricksFileSystem
 from fsspec.registry import get_filesystem_class
@@ -81,7 +82,9 @@ class DatabricksFileSystemDisableCache(DatabricksFileSystem):

 # dbfs objects will be represented with the following URL: dbfs://<path>
 class DBFSStore(DataStore):
-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)

     @property
mlrun/datastore/filestore.py CHANGED
@@ -14,6 +14,7 @@
 import time
 from os import listdir, makedirs, path, stat
 from shutil import copyfile
+from typing import Optional

 import fsspec

@@ -23,7 +24,9 @@ from .base import DataStore, FileStats


 class FileStore(DataStore):
-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, "file", endpoint, secrets=secrets)

         self._item_path, self._real_path = None, None
mlrun/datastore/google_cloud_storage.py CHANGED
@@ -14,6 +14,7 @@
 import json
 import os
 from pathlib import Path
+from typing import Optional

 from fsspec.registry import get_filesystem_class
 from google.auth.credentials import Credentials
@@ -33,7 +34,9 @@ class GoogleCloudStorageStore(DataStore):
     workers = 8
     chunk_size = 32 * 1024 * 1024

-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self._storage_client = None
         self._storage_options = None
mlrun/datastore/hdfs.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+from typing import Optional
 from urllib.parse import urlparse

 import fsspec
@@ -20,7 +21,9 @@ from mlrun.datastore.base import DataStore


 class HdfsStore(DataStore):
-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)

         self.host = self._get_secret_or_env("HDFS_HOST")
mlrun/datastore/inmem.py CHANGED
@@ -17,6 +17,7 @@ from io import BytesIO, StringIO
 import pandas as pd

 import mlrun
+import mlrun.utils.helpers

 from .base import DataStore, FileStats

@@ -35,7 +36,9 @@ class InMemoryStore(DataStore):

     def _get_item(self, key):
         if key not in self._items:
-            raise ValueError(f"item {key} not found in memory store")
+            raise mlrun.errors.MLRunNotFoundError(
+                f"item {key} not found in memory store"
+            )
         return self._items[key]

     def get(self, key, size=None, offset=0):
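Since the in-memory store now raises mlrun's own not-found error instead of a bare ValueError, callers that relied on catching ValueError need the narrower type; a hedged sketch, assuming the memory:// scheme routes to InMemoryStore and noting that MLRunNotFoundError does not subclass ValueError:

    import mlrun
    import mlrun.errors

    item = mlrun.get_dataitem("memory://missing-key")  # illustrative in-memory URL
    try:
        item.get()
    except mlrun.errors.MLRunNotFoundError as err:
        print("not found:", err)  # previously surfaced as a bare ValueError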
mlrun/datastore/redis.py CHANGED
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from typing import Optional
 from urllib.parse import urlparse

 import redis
@@ -30,7 +31,9 @@ class RedisStore(DataStore):
     - key and value sizes are limited to 512MB
     """

-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
mlrun/datastore/s3.py CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import time
+from typing import Optional

 import boto3
 from boto3.s3.transfer import TransferConfig
@@ -26,7 +27,9 @@ from .base import DataStore, FileStats, get_range, make_datastore_schema_sanitiz
 class S3Store(DataStore):
     using_bucket = True

-    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+    def __init__(
+        self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
+    ):
         super().__init__(parent, name, schema, endpoint, secrets)
         # will be used in case user asks to assume a role and work through fsspec
         self._temp_credentials = None
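The constructor change repeated across these datastores is the same PEP 484 cleanup: a parameter defaulting to None must spell out Optional in its annotation rather than relying on the implicit-Optional behavior that strict type checkers reject. In short:

    from typing import Optional

    # Before: implicit Optional, flagged by strict checkers (PEP 484).
    def connect_old(secrets: dict = None): ...

    # After: the None default is explicit in the annotation.
    def connect_new(secrets: Optional[dict] = None): ...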