mlrun 1.7.0rc28__py3-none-any.whl → 1.7.0rc55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (135)
  1. mlrun/__main__.py +4 -2
  2. mlrun/alerts/alert.py +75 -8
  3. mlrun/artifacts/base.py +1 -0
  4. mlrun/artifacts/manager.py +9 -2
  5. mlrun/common/constants.py +4 -1
  6. mlrun/common/db/sql_session.py +3 -2
  7. mlrun/common/formatters/__init__.py +1 -0
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/{model_monitoring/application.py → common/formatters/feature_set.py} +20 -6
  10. mlrun/common/formatters/run.py +3 -0
  11. mlrun/common/helpers.py +0 -1
  12. mlrun/common/schemas/__init__.py +3 -1
  13. mlrun/common/schemas/alert.py +15 -12
  14. mlrun/common/schemas/api_gateway.py +6 -6
  15. mlrun/common/schemas/auth.py +5 -0
  16. mlrun/common/schemas/client_spec.py +0 -1
  17. mlrun/common/schemas/common.py +7 -4
  18. mlrun/common/schemas/frontend_spec.py +7 -0
  19. mlrun/common/schemas/function.py +7 -0
  20. mlrun/common/schemas/model_monitoring/__init__.py +4 -3
  21. mlrun/common/schemas/model_monitoring/constants.py +41 -26
  22. mlrun/common/schemas/model_monitoring/model_endpoints.py +23 -47
  23. mlrun/common/schemas/notification.py +69 -12
  24. mlrun/common/schemas/project.py +45 -12
  25. mlrun/common/schemas/workflow.py +10 -2
  26. mlrun/common/types.py +1 -0
  27. mlrun/config.py +91 -35
  28. mlrun/data_types/data_types.py +6 -1
  29. mlrun/data_types/spark.py +2 -2
  30. mlrun/data_types/to_pandas.py +57 -25
  31. mlrun/datastore/__init__.py +1 -0
  32. mlrun/datastore/alibaba_oss.py +3 -2
  33. mlrun/datastore/azure_blob.py +125 -37
  34. mlrun/datastore/base.py +42 -21
  35. mlrun/datastore/datastore.py +4 -2
  36. mlrun/datastore/datastore_profile.py +1 -1
  37. mlrun/datastore/dbfs_store.py +3 -7
  38. mlrun/datastore/filestore.py +1 -3
  39. mlrun/datastore/google_cloud_storage.py +85 -29
  40. mlrun/datastore/inmem.py +4 -1
  41. mlrun/datastore/redis.py +1 -0
  42. mlrun/datastore/s3.py +25 -12
  43. mlrun/datastore/sources.py +76 -4
  44. mlrun/datastore/spark_utils.py +30 -0
  45. mlrun/datastore/storeytargets.py +151 -0
  46. mlrun/datastore/targets.py +102 -131
  47. mlrun/datastore/v3io.py +1 -0
  48. mlrun/db/base.py +15 -6
  49. mlrun/db/httpdb.py +57 -28
  50. mlrun/db/nopdb.py +29 -5
  51. mlrun/errors.py +20 -3
  52. mlrun/execution.py +46 -5
  53. mlrun/feature_store/api.py +25 -1
  54. mlrun/feature_store/common.py +6 -11
  55. mlrun/feature_store/feature_vector.py +3 -1
  56. mlrun/feature_store/retrieval/job.py +4 -1
  57. mlrun/feature_store/retrieval/spark_merger.py +10 -39
  58. mlrun/feature_store/steps.py +8 -0
  59. mlrun/frameworks/_common/plan.py +3 -3
  60. mlrun/frameworks/_ml_common/plan.py +1 -1
  61. mlrun/frameworks/parallel_coordinates.py +2 -3
  62. mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
  63. mlrun/k8s_utils.py +48 -2
  64. mlrun/launcher/client.py +6 -6
  65. mlrun/launcher/local.py +2 -2
  66. mlrun/model.py +215 -34
  67. mlrun/model_monitoring/api.py +38 -24
  68. mlrun/model_monitoring/applications/__init__.py +1 -2
  69. mlrun/model_monitoring/applications/_application_steps.py +60 -29
  70. mlrun/model_monitoring/applications/base.py +2 -174
  71. mlrun/model_monitoring/applications/context.py +197 -70
  72. mlrun/model_monitoring/applications/evidently_base.py +11 -85
  73. mlrun/model_monitoring/applications/histogram_data_drift.py +21 -16
  74. mlrun/model_monitoring/applications/results.py +4 -4
  75. mlrun/model_monitoring/controller.py +110 -282
  76. mlrun/model_monitoring/db/stores/__init__.py +8 -3
  77. mlrun/model_monitoring/db/stores/base/store.py +3 -0
  78. mlrun/model_monitoring/db/stores/sqldb/models/base.py +9 -7
  79. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +18 -3
  80. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +43 -23
  81. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +48 -35
  82. mlrun/model_monitoring/db/tsdb/__init__.py +7 -2
  83. mlrun/model_monitoring/db/tsdb/base.py +147 -15
  84. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +94 -55
  85. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -3
  86. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +144 -38
  87. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +44 -3
  88. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +246 -57
  89. mlrun/model_monitoring/helpers.py +70 -50
  90. mlrun/model_monitoring/stream_processing.py +96 -195
  91. mlrun/model_monitoring/writer.py +13 -5
  92. mlrun/package/packagers/default_packager.py +2 -2
  93. mlrun/projects/operations.py +16 -8
  94. mlrun/projects/pipelines.py +126 -115
  95. mlrun/projects/project.py +286 -129
  96. mlrun/render.py +3 -3
  97. mlrun/run.py +38 -19
  98. mlrun/runtimes/__init__.py +19 -8
  99. mlrun/runtimes/base.py +4 -1
  100. mlrun/runtimes/daskjob.py +1 -1
  101. mlrun/runtimes/funcdoc.py +1 -1
  102. mlrun/runtimes/kubejob.py +6 -6
  103. mlrun/runtimes/local.py +12 -5
  104. mlrun/runtimes/nuclio/api_gateway.py +68 -8
  105. mlrun/runtimes/nuclio/application/application.py +307 -70
  106. mlrun/runtimes/nuclio/function.py +63 -14
  107. mlrun/runtimes/nuclio/serving.py +10 -10
  108. mlrun/runtimes/pod.py +25 -19
  109. mlrun/runtimes/remotesparkjob.py +2 -5
  110. mlrun/runtimes/sparkjob/spark3job.py +16 -17
  111. mlrun/runtimes/utils.py +34 -0
  112. mlrun/serving/routers.py +2 -5
  113. mlrun/serving/server.py +37 -19
  114. mlrun/serving/states.py +30 -3
  115. mlrun/serving/v2_serving.py +44 -35
  116. mlrun/track/trackers/mlflow_tracker.py +5 -0
  117. mlrun/utils/async_http.py +1 -1
  118. mlrun/utils/db.py +18 -0
  119. mlrun/utils/helpers.py +150 -36
  120. mlrun/utils/http.py +1 -1
  121. mlrun/utils/notifications/notification/__init__.py +0 -1
  122. mlrun/utils/notifications/notification/webhook.py +8 -1
  123. mlrun/utils/notifications/notification_pusher.py +1 -1
  124. mlrun/utils/v3io_clients.py +2 -2
  125. mlrun/utils/version/version.json +2 -2
  126. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/METADATA +153 -66
  127. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/RECORD +131 -134
  128. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/WHEEL +1 -1
  129. mlrun/feature_store/retrieval/conversion.py +0 -271
  130. mlrun/model_monitoring/controller_handler.py +0 -37
  131. mlrun/model_monitoring/evidently_application.py +0 -20
  132. mlrun/model_monitoring/prometheus.py +0 -216
  133. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/LICENSE +0 -0
  134. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/entry_points.txt +0 -0
  135. {mlrun-1.7.0rc28.dist-info → mlrun-1.7.0rc55.dist-info}/top_level.txt +0 -0
mlrun/config.py CHANGED
@@ -27,6 +27,7 @@ import copy
 import json
 import os
 import typing
+import warnings
 from collections.abc import Mapping
 from datetime import timedelta
 from distutils.util import strtobool
@@ -35,6 +36,7 @@ from threading import Lock
 
 import dotenv
 import semver
+import urllib3.exceptions
 import yaml
 
 import mlrun.common.constants
@@ -52,6 +54,11 @@ default_config = {
     "kubernetes": {
         "kubeconfig_path": "",  # local path to kubeconfig file (for development purposes),
         # empty by default as the API already running inside k8s cluster
+        "pagination": {
+            # pagination config for interacting with k8s API
+            "list_pods_limit": 200,
+            "list_crd_objects_limit": 200,
+        },
     },
     "dbpath": "",  # db/api url
     # url to nuclio dashboard api (can be with user & token, e.g. https://username:password@dashboard-url.com)
@@ -64,11 +71,15 @@ default_config = {
     "api_base_version": "v1",
     "version": "",  # will be set to current version
     "images_tag": "",  # tag to use with mlrun images e.g. mlrun/mlrun (defaults to version)
-    "images_registry": "",  # registry to use with mlrun images e.g. quay.io/ (defaults to empty, for dockerhub)
+    # registry to use with mlrun images that start with "mlrun/" e.g. quay.io/ (defaults to empty, for dockerhub)
+    "images_registry": "",
+    # registry to use with non-mlrun images (don't start with "mlrun/") specified in 'images_to_enrich_registry'
+    # defaults to empty, for dockerhub
+    "vendor_images_registry": "",
     # comma separated list of images that are in the specified images_registry, and therefore will be enriched with this
     # registry when used. default to mlrun/* which means any image which is of the mlrun repository (mlrun/mlrun,
     # mlrun/ml-base, etc...)
-    "images_to_enrich_registry": "^mlrun/*",
+    "images_to_enrich_registry": "^mlrun/*,python:3.9",
     "kfp_url": "",
     "kfp_ttl": "14400",  # KFP ttl in sec, after that completed PODs will be deleted
     "kfp_image": "mlrun/mlrun",  # image to use for KFP runner (defaults to mlrun/mlrun)
@@ -104,7 +115,12 @@ default_config = {
             # max number of parallel abort run jobs in runs monitoring
             "concurrent_abort_stale_runs_workers": 10,
             "list_runs_time_period_in_days": 7,  # days
-        }
+        },
+        "projects": {
+            "summaries": {
+                "cache_interval": "30",
+            },
+        },
     },
     "crud": {
         "runs": {
@@ -138,6 +154,11 @@ default_config = {
         "datasets": {
             "max_preview_columns": 100,
         },
+        "limits": {
+            "max_chunk_size": 1024 * 1024 * 1,  # 1MB
+            "max_preview_size": 1024 * 1024 * 10,  # 10MB
+            "max_download_size": 1024 * 1024 * 100,  # 100MB
+        },
     },
     # FIXME: Adding these defaults here so we won't need to patch the "installing component" (provazio-controller) to
     # configure this values on field systems, for newer system this will be configured correctly
@@ -238,7 +259,7 @@ default_config = {
         },
         "application": {
             "default_sidecar_internal_port": 8050,
-            "default_authentication_mode": "accessKey",
+            "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
         },
     },
     # TODO: function defaults should be moved to the function spec config above
@@ -250,7 +271,7 @@ default_config = {
         "remote": "mlrun/mlrun",
         "dask": "mlrun/ml-base",
         "mpijob": "mlrun/mlrun",
-        "application": "python:3.9-slim",
+        "application": "python:3.9",
     },
     # see enrich_function_preemption_spec for more info,
     # and mlrun.common.schemas.function.PreemptionModes for available options
@@ -265,6 +286,16 @@ default_config = {
             "url": "",
             "service": "mlrun-api-chief",
             "port": 8080,
+            "feature_gates": {
+                "scheduler": "enabled",
+                "project_sync": "enabled",
+                "cleanup": "enabled",
+                "runs_monitoring": "enabled",
+                "pagination_cache": "enabled",
+                "project_summaries": "enabled",
+                "start_logs": "enabled",
+                "stop_logs": "enabled",
+            },
         },
         "worker": {
             "sync_with_chief": {
@@ -302,7 +333,7 @@ default_config = {
         "http": {
             # when True, the client will verify the server's TLS
             # set to False for backwards compatibility.
-            "verify": False,
+            "verify": True,
         },
         "db": {
             "commit_retry_timeout": 30,
@@ -433,7 +464,6 @@ default_config = {
     "followers": "",
     # This is used as the interval for the sync loop both when mlrun is leader and follower
     "periodic_sync_interval": "1 minute",
-    "counters_cache_ttl": "2 minutes",
     "project_owners_cache_ttl": "30 seconds",
     # access key to be used when the leader is iguazio and polling is done from it
     "iguazio_access_key": "",
@@ -462,10 +492,10 @@ default_config = {
     # pip install <requirement_specifier>, e.g. mlrun==0.5.4, mlrun~=0.5,
     # git+https://github.com/mlrun/mlrun@development. by default uses the version
     "mlrun_version_specifier": "",
-    "kaniko_image": "gcr.io/kaniko-project/executor:v1.21.1",  # kaniko builder image
+    "kaniko_image": "gcr.io/kaniko-project/executor:v1.23.2",  # kaniko builder image
     "kaniko_init_container_image": "alpine:3.18",
     # image for kaniko init container when docker registry is ECR
-    "kaniko_aws_cli_image": "amazon/aws-cli:2.7.10",
+    "kaniko_aws_cli_image": "amazon/aws-cli:2.17.16",
     # kaniko sometimes fails to get filesystem from image, this is a workaround to retry the process
     # a known issue in Kaniko - https://github.com/GoogleContainerTools/kaniko/issues/1717
     "kaniko_image_fs_extraction_retries": "3",
@@ -509,7 +539,6 @@ default_config = {
         "store_prefixes": {
             "default": "v3io:///users/pipelines/{project}/model-endpoints/{kind}",
             "user_space": "v3io:///projects/{project}/model-endpoints/{kind}",
-            "stream": "",  # TODO: Delete in 1.9.0
             "monitoring_application": "v3io:///users/pipelines/{project}/monitoring-apps/",
         },
         # Offline storage path can be either relative or a full path. This path is used for general offline data
@@ -522,7 +551,6 @@ default_config = {
         "parquet_batching_max_events": 10_000,
         "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
         # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
-        "store_type": "v3io-nosql",  # TODO: Delete in 1.9.0
         "endpoint_store_connection": "",
         # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
         "tsdb_connection": "",
@@ -706,7 +734,7 @@ default_config = {
     "grafana_url": "",
     "alerts": {
         # supported modes: "enabled", "disabled".
-        "mode": "enabled",
+        "mode": "disabled",
         # maximum number of alerts we allow to be configured.
         # user will get an error when exceeding this
         "max_allowed": 10000,
@@ -768,7 +796,21 @@ class Config:
         for key, value in cfg.items():
             if hasattr(self, key):
                 if isinstance(value, dict):
-                    getattr(self, key).update(value)
+                    # ignore the `skip_errors` flag here
+                    # if the key does not align with what mlrun config expects it is a user
+                    # input error that can lead to unexpected behavior.
+                    # raise the exception to ensure configuration is loaded correctly and do not
+                    # ignore any errors.
+                    config_value = getattr(self, key)
+                    try:
+                        config_value.update(value)
+                    except AttributeError as exc:
+                        if not isinstance(config_value, (dict, Config)):
+                            raise ValueError(
+                                f"Can not update `{key}` config. "
+                                f"Expected a configuration but received {type(value)}"
+                            ) from exc
+                        raise exc
                 else:
                     try:
                         setattr(self, key, value)
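
The practical effect of the stricter `Config.update`: passing a dict for a key whose existing value is a scalar now raises a descriptive `ValueError` instead of a bare `AttributeError`. A hypothetical sketch (key names are illustrative):

from mlrun.config import config

# Merging a dict into a nested section works as before:
config.update({"httpdb": {"http": {"verify": True}}})

# A dict supplied where the existing value is a plain string (a user input
# error) now fails loudly:
try:
    config.update({"dbpath": {"unexpected": "nesting"}})
except ValueError as err:
    print(err)  # Can not update `dbpath` config. Expected a configuration but received <class 'dict'>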
@@ -840,7 +882,7 @@ class Config:
                 f"Unable to decode {attribute_path}"
             )
         parsed_attribute_value = json.loads(decoded_attribute_value)
-        if type(parsed_attribute_value) != expected_type:
+        if not isinstance(parsed_attribute_value, expected_type):
             raise mlrun.errors.MLRunInvalidArgumentTypeError(
                 f"Expected type {expected_type}, got {type(parsed_attribute_value)}"
             )
@@ -1032,6 +1074,14 @@ class Config:
                 resource_requirement.pop(gpu)
         return resource_requirement
 
+    def force_api_gateway_ssl_redirect(self):
+        """
+        Get the default value for the ssl_redirect configuration.
+        In Iguazio we always want to redirect to HTTPS, in other cases we don't.
+        :return: True if we should redirect to HTTPS, False otherwise.
+        """
+        return self.is_running_on_iguazio()
+
     def to_dict(self):
         return copy.deepcopy(self._cfg)
 
@@ -1064,6 +1114,9 @@ class Config:
         # importing here to avoid circular dependency
         import mlrun.db
 
+        # It ensures that SSL verification is set before establishing a connection
+        _configure_ssl_verification(self.httpdb.http.verify)
+
         # when dbpath is set we want to connect to it which will sync configuration from it to the client
         mlrun.db.get_run_db(value, force_reconnect=True)
 
@@ -1092,10 +1145,10 @@ class Config:
         project: str = "",
         kind: str = "",
         target: str = "online",
-        artifact_path: str = None,
-        function_name: str = None,
+        artifact_path: typing.Optional[str] = None,
+        function_name: typing.Optional[str] = None,
         **kwargs,
-    ) -> typing.Union[str, list[str]]:
+    ) -> str:
         """Get the full path from the configuration based on the provided project and kind.
 
         :param project: Project name.
@@ -1111,8 +1164,7 @@ class Config:
                               relative artifact path will be taken from the global MLRun artifact path.
         :param function_name: Application name, None for model_monitoring_stream.
 
-        :return: Full configured path for the provided kind. Can be either a single path
-                 or a list of paths in the case of the online model monitoring stream path.
+        :return: Full configured path for the provided kind.
         """
 
         if target != "offline":
@@ -1133,17 +1185,11 @@ class Config:
                 if function_name is None
                 else f"{kind}-{function_name.lower()}",
             )
-        elif kind == "stream":  # return list for mlrun<1.6.3 BC
-            return [
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                    project=project,
-                    kind=kind,
-                ),  # old stream uri (pipelines) for BC ML-6043
-                mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
-                    project=project,
-                    kind=kind,
-                ),  # new stream uri (projects)
-            ]
+        elif kind == "stream":
+            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                project=project,
+                kind=kind,
+            )
         else:
             return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
                 project=project,
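
With the mlrun<1.6.3 backward-compatibility branch gone, `kind == "stream"` resolves to a single URI built from the `user_space` prefix (see the `store_prefixes` hunk above). A quick sketch of the resolution:

# The user_space template from this diff's store_prefixes section.
user_space = "v3io:///projects/{project}/model-endpoints/{kind}"
print(user_space.format(project="my-project", kind="stream"))
# v3io:///projects/my-project/model-endpoints/stream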
@@ -1206,12 +1252,11 @@
 
         return storage_options
 
-    def is_explicit_ack(self, version=None) -> bool:
-        if not version:
-            version = self.nuclio_version
+    def is_explicit_ack_enabled(self) -> bool:
         return self.httpdb.nuclio.explicit_ack == "enabled" and (
-            not version
-            or semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
+            not self.nuclio_version
+            or semver.VersionInfo.parse(self.nuclio_version)
+            >= semver.VersionInfo.parse("1.12.10")
         )
 
 
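The renamed `is_explicit_ack_enabled` drops the `version` parameter and always gates on the configured `nuclio_version`. A standalone sketch of the semver gate it applies:

import semver

# Explicit ack needs Nuclio >= 1.12.10; an empty/unknown version also passes.
for version in ("1.12.9", "1.12.10", "1.13.0"):
    meets = semver.VersionInfo.parse(version) >= semver.VersionInfo.parse("1.12.10")
    print(version, meets)  # False, True, True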
@@ -1261,6 +1306,7 @@ def _do_populate(env=None, skip_errors=False):
     if data:
         config.update(data, skip_errors=skip_errors)
 
+    _configure_ssl_verification(config.httpdb.http.verify)
     _validate_config(config)
 
 
@@ -1320,6 +1366,16 @@ def _convert_str(value, typ):
     return typ(value)
 
 
+def _configure_ssl_verification(verify_ssl: bool) -> None:
+    """Configure SSL verification warnings based on the setting."""
+    if not verify_ssl:
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+    else:
+        # If the user changes the `verify` setting to `True` at runtime using `mlrun.set_env_from_file` after
+        # importing `mlrun`, we need to reload the `mlrun` configuration and enable this warning.
+        warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)
+
+
 def read_env(env=None, prefix=env_prefix):
     """Read configuration from environment"""
     env = os.environ if env is None else env
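
In isolation, the new helper only flips urllib3's warning filter. A minimal sketch of both directions:

import warnings

import urllib3
import urllib3.exceptions

# verify=False: silence the per-request InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# verify=True again (e.g. after mlrun.set_env_from_file at runtime): restore
# the default filter so the warning fires once more.
warnings.simplefilter("default", urllib3.exceptions.InsecureRequestWarning)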
mlrun/data_types/data_types.py CHANGED
@@ -70,6 +70,11 @@ def pa_type_to_value_type(type_):
     if isinstance(type_, TimestampType):
         return ValueType.DATETIME
 
+    # pandas category type translates to pyarrow DictionaryType
+    # we need to unpack the value type (ML-7868)
+    if isinstance(type_, pyarrow.DictionaryType):
+        type_ = type_.value_type
+
     type_map = {
         pyarrow.bool_(): ValueType.BOOL,
         pyarrow.int64(): ValueType.INT64,
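
Why the unpacking matters (ML-7868): a pandas `category` column arrives in pyarrow as a `DictionaryType`, and the underlying type sits in its `value_type`. A small sketch:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"color": pd.Categorical(["red", "green", "red"])})
field_type = pa.Table.from_pandas(df).schema.field("color").type

print(isinstance(field_type, pa.DictionaryType))  # True
print(field_type.value_type)                      # string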
@@ -139,7 +144,7 @@ def gbq_to_pandas_dtype(gbq_type):
         "BOOL": "bool",
         "FLOAT": "float64",
         "INTEGER": pd.Int64Dtype(),
-        "TIMESTAMP": "datetime64[ns]",
+        "TIMESTAMP": "datetime64[ns, UTC]",
     }
     return type_map.get(gbq_type, "object")
 
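The TIMESTAMP mapping becomes timezone-aware because BigQuery TIMESTAMP values are UTC instants; plain `datetime64[ns]` silently dropped the zone. For example:

import pandas as pd

s = pd.to_datetime(pd.Series(["2024-01-01T00:00:00Z"]), utc=True)
print(s.dtype)  # datetime64[ns, UTC], matching the new mapping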
mlrun/data_types/spark.py CHANGED
@@ -20,10 +20,10 @@ import pytz
 from pyspark.sql.functions import to_utc_timestamp
 from pyspark.sql.types import BooleanType, DoubleType, TimestampType
 
+from mlrun.feature_store.retrieval.spark_merger import spark_df_to_pandas
 from mlrun.utils import logger
 
 from .data_types import InferOptions, spark_to_value_type
-from .to_pandas import toPandas
 
 try:
     import pyspark.sql.functions as funcs
@@ -75,7 +75,7 @@ def get_df_preview_spark(df, preview_lines=20):
     """capture preview data from spark df"""
     df = df.limit(preview_lines)
 
-    result_dict = toPandas(df).to_dict(orient="split")
+    result_dict = spark_df_to_pandas(df).to_dict(orient="split")
     return [result_dict["columns"], *result_dict["data"]]
 
 
mlrun/data_types/to_pandas.py CHANGED
@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -217,22 +212,59 @@
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
         return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
         return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
         return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
         return None
+
+
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = _to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return _to_pandas(spark_df)
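
A hedged usage sketch of the new helper under Pandas 2 (assumes a local SparkSession, which is not part of this diff):

from datetime import datetime

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, datetime(2024, 1, 1))], ["id", "ts"])

# Timestamp columns are formatted to strings on the Spark side, then cast
# back to datetime64[ns] in pandas, avoiding the Pandas 2 conversion error.
pandas_df = spark_df_to_pandas(df)
print(pandas_df.dtypes)  # id int64, ts datetime64[ns]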
mlrun/datastore/__init__.py CHANGED
@@ -117,6 +117,7 @@ def get_stream_pusher(stream_path: str, **kwargs):
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
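
The added line lets an explicit `endpoint` kwarg win over the endpoint parsed from a `v3io` path. A hedged usage sketch (assuming, per the file list, this hunk is in mlrun/datastore/__init__.py; the URLs are illustrative and pushing requires a reachable stream backend):

from mlrun.datastore import get_stream_pusher

# The explicit endpoint kwarg now overrides the endpoint parsed from the path.
pusher = get_stream_pusher(
    "v3io://default-webapi:8081/projects/my-project/model-endpoints/stream",
    endpoint="https://other-webapi.example.com",  # illustrative override
)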
mlrun/datastore/alibaba_oss.py CHANGED
@@ -22,7 +22,7 @@ from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 
 class OSSStore(DataStore):
@@ -53,7 +53,7 @@ class OSSStore(DataStore):
         except ImportError as exc:
             raise ImportError("ALIBABA ossfs not installed") from exc
         filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
+        self._filesystem = make_datastore_schema_sanitizer(
             filesystem_class,
             using_bucket=self.using_bucket,
             **self.get_storage_options(),
@@ -85,6 +85,7 @@ class OSSStore(DataStore):
         return oss.get_object(key).read()
 
     def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
         bucket, key = self.get_bucket_and_key(key)
         oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
         oss.put_object(key, data)