mlrun 1.6.4rc8__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.

Files changed (305)
  1. mlrun/__init__.py +11 -1
  2. mlrun/__main__.py +40 -122
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +248 -0
  5. mlrun/api/schemas/__init__.py +5 -4
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +47 -257
  8. mlrun/artifacts/dataset.py +11 -192
  9. mlrun/artifacts/manager.py +79 -47
  10. mlrun/artifacts/model.py +31 -159
  11. mlrun/artifacts/plots.py +23 -380
  12. mlrun/common/constants.py +74 -1
  13. mlrun/common/db/sql_session.py +5 -5
  14. mlrun/common/formatters/__init__.py +21 -0
  15. mlrun/common/formatters/artifact.py +45 -0
  16. mlrun/common/formatters/base.py +113 -0
  17. mlrun/common/formatters/feature_set.py +33 -0
  18. mlrun/common/formatters/function.py +46 -0
  19. mlrun/common/formatters/pipeline.py +53 -0
  20. mlrun/common/formatters/project.py +51 -0
  21. mlrun/common/formatters/run.py +29 -0
  22. mlrun/common/helpers.py +12 -3
  23. mlrun/common/model_monitoring/helpers.py +9 -5
  24. mlrun/{runtimes → common/runtimes}/constants.py +37 -9
  25. mlrun/common/schemas/__init__.py +31 -5
  26. mlrun/common/schemas/alert.py +202 -0
  27. mlrun/common/schemas/api_gateway.py +196 -0
  28. mlrun/common/schemas/artifact.py +25 -4
  29. mlrun/common/schemas/auth.py +16 -5
  30. mlrun/common/schemas/background_task.py +1 -1
  31. mlrun/common/schemas/client_spec.py +4 -2
  32. mlrun/common/schemas/common.py +7 -4
  33. mlrun/common/schemas/constants.py +3 -0
  34. mlrun/common/schemas/feature_store.py +74 -44
  35. mlrun/common/schemas/frontend_spec.py +15 -7
  36. mlrun/common/schemas/function.py +12 -1
  37. mlrun/common/schemas/hub.py +11 -18
  38. mlrun/common/schemas/memory_reports.py +2 -2
  39. mlrun/common/schemas/model_monitoring/__init__.py +20 -4
  40. mlrun/common/schemas/model_monitoring/constants.py +123 -42
  41. mlrun/common/schemas/model_monitoring/grafana.py +13 -9
  42. mlrun/common/schemas/model_monitoring/model_endpoints.py +101 -54
  43. mlrun/common/schemas/notification.py +71 -14
  44. mlrun/common/schemas/object.py +2 -2
  45. mlrun/{model_monitoring/controller_handler.py → common/schemas/pagination.py} +9 -12
  46. mlrun/common/schemas/pipeline.py +8 -1
  47. mlrun/common/schemas/project.py +69 -18
  48. mlrun/common/schemas/runs.py +7 -1
  49. mlrun/common/schemas/runtime_resource.py +8 -12
  50. mlrun/common/schemas/schedule.py +4 -4
  51. mlrun/common/schemas/tag.py +1 -2
  52. mlrun/common/schemas/workflow.py +12 -4
  53. mlrun/common/types.py +14 -1
  54. mlrun/config.py +154 -69
  55. mlrun/data_types/data_types.py +6 -1
  56. mlrun/data_types/spark.py +2 -2
  57. mlrun/data_types/to_pandas.py +67 -37
  58. mlrun/datastore/__init__.py +6 -8
  59. mlrun/datastore/alibaba_oss.py +131 -0
  60. mlrun/datastore/azure_blob.py +143 -42
  61. mlrun/datastore/base.py +102 -58
  62. mlrun/datastore/datastore.py +34 -13
  63. mlrun/datastore/datastore_profile.py +146 -20
  64. mlrun/datastore/dbfs_store.py +3 -7
  65. mlrun/datastore/filestore.py +1 -4
  66. mlrun/datastore/google_cloud_storage.py +97 -33
  67. mlrun/datastore/hdfs.py +56 -0
  68. mlrun/datastore/inmem.py +6 -3
  69. mlrun/datastore/redis.py +7 -2
  70. mlrun/datastore/s3.py +34 -12
  71. mlrun/datastore/snowflake_utils.py +45 -0
  72. mlrun/datastore/sources.py +303 -111
  73. mlrun/datastore/spark_utils.py +31 -2
  74. mlrun/datastore/store_resources.py +9 -7
  75. mlrun/datastore/storeytargets.py +151 -0
  76. mlrun/datastore/targets.py +453 -176
  77. mlrun/datastore/utils.py +72 -58
  78. mlrun/datastore/v3io.py +6 -1
  79. mlrun/db/base.py +274 -41
  80. mlrun/db/factory.py +1 -1
  81. mlrun/db/httpdb.py +893 -225
  82. mlrun/db/nopdb.py +291 -33
  83. mlrun/errors.py +36 -6
  84. mlrun/execution.py +115 -42
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +65 -73
  87. mlrun/feature_store/common.py +7 -12
  88. mlrun/feature_store/feature_set.py +76 -55
  89. mlrun/feature_store/feature_vector.py +39 -31
  90. mlrun/feature_store/ingestion.py +7 -6
  91. mlrun/feature_store/retrieval/base.py +16 -11
  92. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  93. mlrun/feature_store/retrieval/job.py +13 -4
  94. mlrun/feature_store/retrieval/local_merger.py +2 -0
  95. mlrun/feature_store/retrieval/spark_merger.py +24 -32
  96. mlrun/feature_store/steps.py +45 -34
  97. mlrun/features.py +11 -21
  98. mlrun/frameworks/_common/artifacts_library.py +9 -9
  99. mlrun/frameworks/_common/mlrun_interface.py +5 -5
  100. mlrun/frameworks/_common/model_handler.py +48 -48
  101. mlrun/frameworks/_common/plan.py +5 -6
  102. mlrun/frameworks/_common/producer.py +3 -4
  103. mlrun/frameworks/_common/utils.py +5 -5
  104. mlrun/frameworks/_dl_common/loggers/logger.py +6 -7
  105. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +9 -9
  106. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +23 -47
  107. mlrun/frameworks/_ml_common/artifacts_library.py +1 -2
  108. mlrun/frameworks/_ml_common/loggers/logger.py +3 -4
  109. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +4 -5
  110. mlrun/frameworks/_ml_common/model_handler.py +24 -24
  111. mlrun/frameworks/_ml_common/pkl_model_server.py +2 -2
  112. mlrun/frameworks/_ml_common/plan.py +2 -2
  113. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +2 -3
  114. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +2 -3
  115. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  116. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +3 -3
  117. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  118. mlrun/frameworks/_ml_common/utils.py +4 -4
  119. mlrun/frameworks/auto_mlrun/auto_mlrun.py +9 -9
  120. mlrun/frameworks/huggingface/model_server.py +4 -4
  121. mlrun/frameworks/lgbm/__init__.py +33 -33
  122. mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
  123. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -5
  124. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -5
  125. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -3
  126. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +6 -6
  127. mlrun/frameworks/lgbm/model_handler.py +10 -10
  128. mlrun/frameworks/lgbm/model_server.py +6 -6
  129. mlrun/frameworks/lgbm/utils.py +5 -5
  130. mlrun/frameworks/onnx/dataset.py +8 -8
  131. mlrun/frameworks/onnx/mlrun_interface.py +3 -3
  132. mlrun/frameworks/onnx/model_handler.py +6 -6
  133. mlrun/frameworks/onnx/model_server.py +7 -7
  134. mlrun/frameworks/parallel_coordinates.py +6 -6
  135. mlrun/frameworks/pytorch/__init__.py +18 -18
  136. mlrun/frameworks/pytorch/callbacks/callback.py +4 -5
  137. mlrun/frameworks/pytorch/callbacks/logging_callback.py +17 -17
  138. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +11 -11
  139. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +23 -29
  140. mlrun/frameworks/pytorch/callbacks_handler.py +38 -38
  141. mlrun/frameworks/pytorch/mlrun_interface.py +20 -20
  142. mlrun/frameworks/pytorch/model_handler.py +17 -17
  143. mlrun/frameworks/pytorch/model_server.py +7 -7
  144. mlrun/frameworks/sklearn/__init__.py +13 -13
  145. mlrun/frameworks/sklearn/estimator.py +4 -4
  146. mlrun/frameworks/sklearn/metrics_library.py +14 -14
  147. mlrun/frameworks/sklearn/mlrun_interface.py +16 -9
  148. mlrun/frameworks/sklearn/model_handler.py +2 -2
  149. mlrun/frameworks/tf_keras/__init__.py +10 -7
  150. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +15 -15
  151. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +11 -11
  152. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +19 -23
  153. mlrun/frameworks/tf_keras/mlrun_interface.py +9 -11
  154. mlrun/frameworks/tf_keras/model_handler.py +14 -14
  155. mlrun/frameworks/tf_keras/model_server.py +6 -6
  156. mlrun/frameworks/xgboost/__init__.py +13 -13
  157. mlrun/frameworks/xgboost/model_handler.py +6 -6
  158. mlrun/k8s_utils.py +61 -17
  159. mlrun/launcher/__init__.py +1 -1
  160. mlrun/launcher/base.py +16 -15
  161. mlrun/launcher/client.py +13 -11
  162. mlrun/launcher/factory.py +1 -1
  163. mlrun/launcher/local.py +23 -13
  164. mlrun/launcher/remote.py +17 -10
  165. mlrun/lists.py +7 -6
  166. mlrun/model.py +478 -103
  167. mlrun/model_monitoring/__init__.py +1 -1
  168. mlrun/model_monitoring/api.py +163 -371
  169. mlrun/{runtimes/mpijob/v1alpha1.py → model_monitoring/applications/__init__.py} +9 -15
  170. mlrun/model_monitoring/applications/_application_steps.py +188 -0
  171. mlrun/model_monitoring/applications/base.py +108 -0
  172. mlrun/model_monitoring/applications/context.py +341 -0
  173. mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
  174. mlrun/model_monitoring/applications/histogram_data_drift.py +354 -0
  175. mlrun/model_monitoring/applications/results.py +99 -0
  176. mlrun/model_monitoring/controller.py +131 -278
  177. mlrun/model_monitoring/db/__init__.py +18 -0
  178. mlrun/model_monitoring/db/stores/__init__.py +136 -0
  179. mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
  180. mlrun/model_monitoring/db/stores/base/store.py +213 -0
  181. mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
  182. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
  183. mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
  184. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
  185. mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
  186. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
  187. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
  188. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
  189. mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
  190. mlrun/model_monitoring/db/tsdb/base.py +448 -0
  191. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  192. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  193. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +279 -0
  194. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
  195. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +507 -0
  196. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  197. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
  198. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
  199. mlrun/model_monitoring/features_drift_table.py +134 -106
  200. mlrun/model_monitoring/helpers.py +199 -55
  201. mlrun/model_monitoring/metrics/__init__.py +13 -0
  202. mlrun/model_monitoring/metrics/histogram_distance.py +127 -0
  203. mlrun/model_monitoring/model_endpoint.py +3 -2
  204. mlrun/model_monitoring/stream_processing.py +134 -398
  205. mlrun/model_monitoring/tracking_policy.py +9 -2
  206. mlrun/model_monitoring/writer.py +161 -125
  207. mlrun/package/__init__.py +6 -6
  208. mlrun/package/context_handler.py +5 -5
  209. mlrun/package/packager.py +7 -7
  210. mlrun/package/packagers/default_packager.py +8 -8
  211. mlrun/package/packagers/numpy_packagers.py +15 -15
  212. mlrun/package/packagers/pandas_packagers.py +5 -5
  213. mlrun/package/packagers/python_standard_library_packagers.py +10 -10
  214. mlrun/package/packagers_manager.py +19 -23
  215. mlrun/package/utils/_formatter.py +6 -6
  216. mlrun/package/utils/_pickler.py +2 -2
  217. mlrun/package/utils/_supported_format.py +4 -4
  218. mlrun/package/utils/log_hint_utils.py +2 -2
  219. mlrun/package/utils/type_hint_utils.py +4 -9
  220. mlrun/platforms/__init__.py +11 -10
  221. mlrun/platforms/iguazio.py +24 -203
  222. mlrun/projects/operations.py +52 -25
  223. mlrun/projects/pipelines.py +191 -197
  224. mlrun/projects/project.py +1227 -400
  225. mlrun/render.py +16 -19
  226. mlrun/run.py +209 -184
  227. mlrun/runtimes/__init__.py +83 -15
  228. mlrun/runtimes/base.py +51 -35
  229. mlrun/runtimes/daskjob.py +17 -10
  230. mlrun/runtimes/databricks_job/databricks_cancel_task.py +1 -1
  231. mlrun/runtimes/databricks_job/databricks_runtime.py +8 -7
  232. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  233. mlrun/runtimes/funcdoc.py +1 -29
  234. mlrun/runtimes/function_reference.py +1 -1
  235. mlrun/runtimes/kubejob.py +34 -128
  236. mlrun/runtimes/local.py +40 -11
  237. mlrun/runtimes/mpijob/__init__.py +0 -20
  238. mlrun/runtimes/mpijob/abstract.py +9 -10
  239. mlrun/runtimes/mpijob/v1.py +1 -1
  240. mlrun/{model_monitoring/stores/models/sqlite.py → runtimes/nuclio/__init__.py} +7 -9
  241. mlrun/runtimes/nuclio/api_gateway.py +769 -0
  242. mlrun/runtimes/nuclio/application/__init__.py +15 -0
  243. mlrun/runtimes/nuclio/application/application.py +758 -0
  244. mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
  245. mlrun/runtimes/{function.py → nuclio/function.py} +200 -83
  246. mlrun/runtimes/{nuclio.py → nuclio/nuclio.py} +6 -6
  247. mlrun/runtimes/{serving.py → nuclio/serving.py} +65 -68
  248. mlrun/runtimes/pod.py +281 -101
  249. mlrun/runtimes/remotesparkjob.py +12 -9
  250. mlrun/runtimes/sparkjob/spark3job.py +67 -51
  251. mlrun/runtimes/utils.py +41 -75
  252. mlrun/secrets.py +9 -5
  253. mlrun/serving/__init__.py +8 -1
  254. mlrun/serving/remote.py +2 -7
  255. mlrun/serving/routers.py +85 -69
  256. mlrun/serving/server.py +69 -44
  257. mlrun/serving/states.py +209 -36
  258. mlrun/serving/utils.py +22 -14
  259. mlrun/serving/v1_serving.py +6 -7
  260. mlrun/serving/v2_serving.py +133 -54
  261. mlrun/track/tracker.py +2 -1
  262. mlrun/track/tracker_manager.py +3 -3
  263. mlrun/track/trackers/mlflow_tracker.py +6 -2
  264. mlrun/utils/async_http.py +6 -8
  265. mlrun/utils/azure_vault.py +1 -1
  266. mlrun/utils/clones.py +1 -2
  267. mlrun/utils/condition_evaluator.py +3 -3
  268. mlrun/utils/db.py +21 -3
  269. mlrun/utils/helpers.py +405 -225
  270. mlrun/utils/http.py +3 -6
  271. mlrun/utils/logger.py +112 -16
  272. mlrun/utils/notifications/notification/__init__.py +17 -13
  273. mlrun/utils/notifications/notification/base.py +50 -2
  274. mlrun/utils/notifications/notification/console.py +2 -0
  275. mlrun/utils/notifications/notification/git.py +24 -1
  276. mlrun/utils/notifications/notification/ipython.py +3 -1
  277. mlrun/utils/notifications/notification/slack.py +96 -21
  278. mlrun/utils/notifications/notification/webhook.py +59 -2
  279. mlrun/utils/notifications/notification_pusher.py +149 -30
  280. mlrun/utils/regex.py +9 -0
  281. mlrun/utils/retryer.py +208 -0
  282. mlrun/utils/singleton.py +1 -1
  283. mlrun/utils/v3io_clients.py +4 -6
  284. mlrun/utils/version/version.json +2 -2
  285. mlrun/utils/version/version.py +2 -6
  286. mlrun-1.7.0.dist-info/METADATA +378 -0
  287. mlrun-1.7.0.dist-info/RECORD +351 -0
  288. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/WHEEL +1 -1
  289. mlrun/feature_store/retrieval/conversion.py +0 -273
  290. mlrun/kfpops.py +0 -868
  291. mlrun/model_monitoring/application.py +0 -310
  292. mlrun/model_monitoring/batch.py +0 -1095
  293. mlrun/model_monitoring/prometheus.py +0 -219
  294. mlrun/model_monitoring/stores/__init__.py +0 -111
  295. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -576
  296. mlrun/model_monitoring/stores/model_endpoint_store.py +0 -147
  297. mlrun/model_monitoring/stores/models/__init__.py +0 -27
  298. mlrun/model_monitoring/stores/models/base.py +0 -84
  299. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -384
  300. mlrun/platforms/other.py +0 -306
  301. mlrun-1.6.4rc8.dist-info/METADATA +0 -272
  302. mlrun-1.6.4rc8.dist-info/RECORD +0 -314
  303. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/LICENSE +0 -0
  304. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/entry_points.txt +0 -0
  305. {mlrun-1.6.4rc8.dist-info → mlrun-1.7.0.dist-info}/top_level.txt +0 -0
mlrun/data_types/to_pandas.py

@@ -15,23 +15,13 @@
 import warnings
 from collections import Counter
 
-from pyspark.sql.types import (
-    BooleanType,
-    ByteType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    IntegralType,
-    LongType,
-    MapType,
-    ShortType,
-    TimestampType,
-)
-
-
-def toPandas(spark_df):
+import pandas as pd
+import semver
+
+
+def _to_pandas(spark_df):
     """
-    Modified version of spark DataFrame.toPandas()
+    Modified version of spark DataFrame.toPandas() -
     https://github.com/apache/spark/blob/v3.2.3/python/pyspark/sql/pandas/conversion.py#L35
 
     The original code (which is only replaced in pyspark 3.5.0) fails with Pandas 2 installed, with the following error:
@@ -40,6 +30,12 @@ def toPandas(spark_df):
     This modification adds the missing unit to the dtype.
     """
     from pyspark.sql.dataframe import DataFrame
+    from pyspark.sql.types import (
+        BooleanType,
+        IntegralType,
+        MapType,
+        TimestampType,
+    )
 
     assert isinstance(spark_df, DataFrame)
 
@@ -48,7 +44,6 @@
     require_minimum_pandas_version()
 
     import numpy as np
-    import pandas as pd
 
     timezone = spark_df.sql_ctx._conf.sessionLocalTimeZone()
 
@@ -65,10 +60,10 @@
             msg = (
                 "toPandas attempted Arrow optimization because "
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-                "failed by the reason below:\n %s\n"
+                f"failed by the reason below:\n {e}\n"
                 "Attempting non-optimization as "
                 "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-                "true." % str(e)
+                "true."
             )
             warnings.warn(msg)
             use_arrow = False
@@ -78,7 +73,7 @@ def toPandas(spark_df):
                 "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                 "reached the error below and will not continue because automatic fallback "
                 "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-                "false.\n %s" % str(e)
+                f"false.\n {e}"
             )
             warnings.warn(msg)
             raise
@@ -94,9 +89,7 @@ def toPandas(spark_df):
             )
 
             # Rename columns to avoid duplicated column names.
-            tmp_column_names = [
-                "col_{}".format(i) for i in range(len(spark_df.columns))
-            ]
+            tmp_column_names = [f"col_{i}" for i in range(len(spark_df.columns))]
             self_destruct = spark_df.sql_ctx._conf.arrowPySparkSelfDestructEnabled()
             batches = spark_df.toDF(*tmp_column_names)._collect_as_arrow(
                 split_batches=self_destruct
@@ -146,7 +139,7 @@ def toPandas(spark_df):
                 "reached the error below and can not continue. Note that "
                 "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                 "effect on failures in the middle of "
-                "computation.\n %s" % str(e)
+                f"computation.\n {e}"
             )
             warnings.warn(msg)
             raise
@@ -156,10 +149,10 @@ def toPandas(spark_df):
     column_counter = Counter(spark_df.columns)
 
     dtype = [None] * len(spark_df.schema)
-    for fieldIdx, field in enumerate(spark_df.schema):
+    for field_idx, field in enumerate(spark_df.schema):
         # For duplicate column name, we use `iloc` to access it.
         if column_counter[field.name] > 1:
-            pandas_col = pdf.iloc[:, fieldIdx]
+            pandas_col = pdf.iloc[:, field_idx]
         else:
             pandas_col = pdf[field.name]
 
@@ -173,12 +166,12 @@ def toPandas(spark_df):
             and field.nullable
             and pandas_col.isnull().any()
         ):
-            dtype[fieldIdx] = pandas_type
+            dtype[field_idx] = pandas_type
         # Ensure we fall back to nullable numpy types, even when whole column is null:
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = np.float64
+            dtype[field_idx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = object
+            dtype[field_idx] = object
 
     df = pd.DataFrame()
     for index, t in enumerate(dtype):
@@ -219,22 +212,59 @@ def toPandas(spark_df):
 
 def _to_corrected_pandas_type(dt):
     import numpy as np
+    from pyspark.sql.types import (
+        BooleanType,
+        ByteType,
+        DoubleType,
+        FloatType,
+        IntegerType,
+        LongType,
+        ShortType,
+        TimestampType,
+    )
 
-    if type(dt) == ByteType:
+    if isinstance(dt, ByteType):
         return np.int8
-    elif type(dt) == ShortType:
+    elif isinstance(dt, ShortType):
         return np.int16
-    elif type(dt) == IntegerType:
+    elif isinstance(dt, IntegerType):
         return np.int32
-    elif type(dt) == LongType:
+    elif isinstance(dt, LongType):
         return np.int64
-    elif type(dt) == FloatType:
+    elif isinstance(dt, FloatType):
         return np.float32
-    elif type(dt) == DoubleType:
+    elif isinstance(dt, DoubleType):
         return np.float64
-    elif type(dt) == BooleanType:
+    elif isinstance(dt, BooleanType):
         return bool
-    elif type(dt) == TimestampType:
+    elif isinstance(dt, TimestampType):
         return "datetime64[ns]"
     else:
        return None
+
+
+def spark_df_to_pandas(spark_df):
+    # as of pyspark 3.2.3, toPandas fails to convert timestamps unless we work around the issue
+    # when we upgrade pyspark, we should check whether this workaround is still necessary
+    # see https://stackoverflow.com/questions/76389694/transforming-pyspark-to-pandas-dataframe
+    if semver.parse(pd.__version__)["major"] >= 2:
+        import pyspark.sql.functions as pyspark_functions
+
+        type_conversion_dict = {}
+        for field in spark_df.schema.fields:
+            if str(field.dataType) == "TimestampType":
+                spark_df = spark_df.withColumn(
+                    field.name,
+                    pyspark_functions.date_format(
+                        pyspark_functions.to_timestamp(field.name),
+                        "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSS",
+                    ),
+                )
+                type_conversion_dict[field.name] = "datetime64[ns]"
+
+        df = _to_pandas(spark_df)
+        if type_conversion_dict:
+            df = df.astype(type_conversion_dict)
+        return df
+    else:
+        return _to_pandas(spark_df)
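
For orientation, a minimal sketch of calling the new public entry point; the local SparkSession setup and sample schema below are illustrative assumptions, not taken from the diff:

# Hedged usage sketch for spark_df_to_pandas (module path per the file list above).
from pyspark.sql import SparkSession

from mlrun.data_types.to_pandas import spark_df_to_pandas

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark_df = spark.createDataFrame(
    [("2023-01-01 00:00:00",)], schema="ts string"
).selectExpr("to_timestamp(ts) as ts")

# With pandas >= 2 installed, timestamp columns are stringified and cast back to
# datetime64[ns]; with pandas 1.x this reduces to a plain _to_pandas call.
pdf = spark_df_to_pandas(spark_df)
print(pdf.dtypes)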
mlrun/datastore/__init__.py

@@ -64,7 +64,7 @@ from .store_resources import (
     parse_store_uri,
 )
 from .targets import CSVTarget, NoSqlTarget, ParquetTarget, StreamTarget
-from .utils import parse_kafka_url
+from .utils import get_kafka_brokers_from_dict, parse_kafka_url
 
 store_manager = StoreManager()
 
@@ -107,19 +107,17 @@ def get_stream_pusher(stream_path: str, **kwargs):
     :param stream_path: path/url of stream
     """
 
-    if stream_path.startswith("kafka://") or "kafka_bootstrap_servers" in kwargs:
-        topic, bootstrap_servers = parse_kafka_url(
-            stream_path, kwargs.get("kafka_bootstrap_servers")
-        )
-        return KafkaOutputStream(
-            topic, bootstrap_servers, kwargs.get("kafka_producer_options")
-        )
+    kafka_brokers = get_kafka_brokers_from_dict(kwargs)
+    if stream_path.startswith("kafka://") or kafka_brokers:
+        topic, brokers = parse_kafka_url(stream_path, kafka_brokers)
+        return KafkaOutputStream(topic, brokers, kwargs.get("kafka_producer_options"))
     elif stream_path.startswith("http://") or stream_path.startswith("https://"):
        return HTTPOutputStream(stream_path=stream_path)
     elif "://" not in stream_path:
         return OutputStream(stream_path, **kwargs)
     elif stream_path.startswith("v3io"):
         endpoint, stream_path = parse_path(stream_path)
+        endpoint = kwargs.pop("endpoint", None) or endpoint
         return OutputStream(stream_path, endpoint=endpoint, **kwargs)
     elif stream_path.startswith("dummy://"):
         return _DummyStream(**kwargs)
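
A short sketch of the refactored Kafka branch; the broker address and topic are placeholders, and the exact kwarg keys that get_kafka_brokers_from_dict reads are not visible in this hunk:

# Hedged sketch: brokers can come from the kafka:// URL itself (parsed by
# parse_kafka_url) or from kwargs via get_kafka_brokers_from_dict, whose
# recognized keys are not shown in this diff. push() is assumed from the
# output-stream interface.
from mlrun.datastore import get_stream_pusher

pusher = get_stream_pusher("kafka://broker-1:9092/my-topic")
pusher.push({"id": 1, "value": 0.5})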
mlrun/datastore/alibaba_oss.py (new file)

@@ -0,0 +1,131 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+import oss2
+from fsspec.registry import get_filesystem_class
+
+import mlrun.errors
+
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+
+
+class OSSStore(DataStore):
+    using_bucket = True
+
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+        # will be used in case user asks to assume a role and work through fsspec
+
+        access_key_id = self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID")
+        secret_key = self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY")
+        endpoint_url = self._get_secret_or_env("ALIBABA_ENDPOINT_URL")
+        if access_key_id and secret_key and endpoint_url:
+            self.auth = oss2.Auth(access_key_id, secret_key)
+            self.endpoint_url = endpoint_url
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "missing ALIBABA_ACCESS_KEY_ID or ALIBABA_SECRET_ACCESS_KEY ALIBABA_ENDPOINT_URL in environment"
+            )
+
+    @property
+    def filesystem(self):
+        """return fsspec file system object, if supported"""
+        if self._filesystem:
+            return self._filesystem
+        try:
+            import ossfs  # noqa
+        except ImportError as exc:
+            raise ImportError("ALIBABA ossfs not installed") from exc
+        filesystem_class = get_filesystem_class(protocol=self.kind)
+        self._filesystem = make_datastore_schema_sanitizer(
+            filesystem_class,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
+        )
+        return self._filesystem
+
+    def get_storage_options(self):
+        res = dict(
+            endpoint=self._get_secret_or_env("ALIBABA_ENDPOINT_URL"),
+            key=self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID"),
+            secret=self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY"),
+        )
+        return self._sanitize_storage_options(res)
+
+    def get_bucket_and_key(self, key):
+        path = self._join(key)[1:]
+        return self.endpoint, path
+
+    def upload(self, key, src_path):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, open(src_path, "rb"))
+
+    def get(self, key, size=None, offset=0):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        if size or offset:
+            return oss.get_object(key, byte_range=self.get_range(size, offset)).read()
+        return oss.get_object(key).read()
+
+    def put(self, key, data, append=False):
+        data, _ = self._prepare_put_data(data, append)
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, data)
+
+    def stat(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        obj = oss.get_object_meta(key)
+        size = obj.content_length
+        modified = datetime.fromtimestamp(obj.last_modified)
+        return FileStats(size, time.mktime(modified.timetuple()))
+
+    def listdir(self, key):
+        remote_path = self._convert_key_to_remote_path(key)
+        if self.filesystem.isfile(remote_path):
+            return key
+        remote_path = f"{remote_path}/**"
+        files = self.filesystem.glob(remote_path)
+        key_length = len(key)
+        files = [
+            f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
+        ]
+        return files
+
+    def delete(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.delete_object(key)
+
+    def _convert_key_to_remote_path(self, key):
+        key = key.strip("/")
+        schema = urlparse(key).scheme
+        # if called without passing dataitem - like in fset.purge_targets,
+        # key will include schema.
+        if not schema:
+            key = Path(self.endpoint, key).as_posix()
+        return key
+
+    @staticmethod
+    def get_range(size, offset):
+        if size:
+            return [offset, size]
+        return [offset, None]
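
A hedged sketch of exercising the new datastore, assuming the oss:// scheme is registered for OSSStore and that mlrun.get_dataitem resolves it; bucket, path, and endpoint values are placeholders:

# Hedged sketch: all values are placeholders; the oss:// scheme registration
# for OSSStore is assumed from this new module, not shown in this diff.
import os

import mlrun

os.environ["ALIBABA_ACCESS_KEY_ID"] = "<access-key-id>"
os.environ["ALIBABA_SECRET_ACCESS_KEY"] = "<secret-access-key>"
os.environ["ALIBABA_ENDPOINT_URL"] = "https://oss-us-east-1.aliyuncs.com"

# All three variables are required; OSSStore raises MLRunInvalidArgumentError otherwise.
item = mlrun.get_dataitem("oss://my-bucket/path/file.csv")
print(item.get()[:100])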
mlrun/datastore/azure_blob.py

@@ -16,12 +16,13 @@ import time
 from pathlib import Path
 from urllib.parse import urlparse
 
+from azure.storage.blob import BlobServiceClient
 from azure.storage.blob._shared.base_client import parse_connection_str
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
-from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+from .base import DataStore, FileStats, make_datastore_schema_sanitizer
 
 # Azure blobs will be represented with the following URL: az://<container name>. The storage account is already
 # pointed to by the connection string, so the user is not expected to specify it in any way.
@@ -29,47 +30,131 @@ from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
 
 class AzureBlobStore(DataStore):
     using_bucket = True
+    max_concurrency = 100
+    max_blocksize = 1024 * 1024 * 4
+    max_single_put_size = (
+        1024 * 1024 * 8
+    )  # for service_client property only, does not affect filesystem
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
+        self._service_client = None
+        self._storage_options = None
+
+    def get_storage_options(self):
+        return self.storage_options
+
+    @property
+    def storage_options(self):
+        if not self._storage_options:
+            res = dict(
+                account_name=self._get_secret_or_env("account_name")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
+                account_key=self._get_secret_or_env("account_key")
+                or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_KEY"),
+                connection_string=self._get_secret_or_env("connection_string")
+                or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
+                tenant_id=self._get_secret_or_env("tenant_id")
+                or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
+                client_id=self._get_secret_or_env("client_id")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
+                client_secret=self._get_secret_or_env("client_secret")
+                or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
+                sas_token=self._get_secret_or_env("sas_token")
+                or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
+                credential=self._get_secret_or_env("credential"),
+            )
+            self._storage_options = self._sanitize_storage_options(res)
+        return self._storage_options
 
     @property
     def filesystem(self):
         """return fsspec file system object, if supported"""
-        if self._filesystem:
-            return self._filesystem
         try:
             import adlfs  # noqa
         except ImportError as exc:
             raise ImportError("Azure adlfs not installed") from exc
-        # in order to support az and wasbs kinds.
-        filesystem_class = get_filesystem_class(protocol=self.kind)
-        self._filesystem = makeDatastoreSchemaSanitizer(
-            filesystem_class,
-            using_bucket=self.using_bucket,
-            **self.get_storage_options(),
-        )
+
+        if not self._filesystem:
+            # in order to support az and wasbs kinds
+            filesystem_class = get_filesystem_class(protocol=self.kind)
+            self._filesystem = make_datastore_schema_sanitizer(
+                filesystem_class,
+                using_bucket=self.using_bucket,
+                blocksize=self.max_blocksize,
+                **self.storage_options,
+            )
         return self._filesystem
 
-    def get_storage_options(self):
-        res = dict(
-            account_name=self._get_secret_or_env("account_name")
-            or self._get_secret_or_env("AZURE_STORAGE_ACCOUNT_NAME"),
-            account_key=self._get_secret_or_env("account_key")
-            or self._get_secret_or_env("AZURE_STORAGE_KEY"),
-            connection_string=self._get_secret_or_env("connection_string")
-            or self._get_secret_or_env("AZURE_STORAGE_CONNECTION_STRING"),
-            tenant_id=self._get_secret_or_env("tenant_id")
-            or self._get_secret_or_env("AZURE_STORAGE_TENANT_ID"),
-            client_id=self._get_secret_or_env("client_id")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_ID"),
-            client_secret=self._get_secret_or_env("client_secret")
-            or self._get_secret_or_env("AZURE_STORAGE_CLIENT_SECRET"),
-            sas_token=self._get_secret_or_env("sas_token")
-            or self._get_secret_or_env("AZURE_STORAGE_SAS_TOKEN"),
-            credential=self._get_secret_or_env("credential"),
-        )
-        return self._sanitize_storage_options(res)
+    @property
+    def service_client(self):
+        try:
+            import azure  # noqa
+        except ImportError as exc:
+            raise ImportError("Azure not installed") from exc
+
+        if not self._service_client:
+            self._do_connect()
+        return self._service_client
+
+    def _do_connect(self):
+        """
+        Creates a client for azure.
+        Raises MLRunInvalidArgumentError if none of the connection details are available,
+        based on do_connect in AzureBlobFileSystem:
+        https://github.com/fsspec/adlfs/blob/2023.9.0/adlfs/spec.py#L422
+        """
+        from azure.identity import ClientSecretCredential
+
+        storage_options = self.storage_options
+        connection_string = storage_options.get("connection_string")
+        client_name = storage_options.get("account_name")
+        account_key = storage_options.get("account_key")
+        sas_token = storage_options.get("sas_token")
+        client_id = storage_options.get("client_id")
+        credential = storage_options.get("credential")
+
+        credential_from_client_id = None
+        if (
+            credential is None
+            and account_key is None
+            and sas_token is None
+            and client_id is not None
+        ):
+            credential_from_client_id = ClientSecretCredential(
+                tenant_id=storage_options.get("tenant_id"),
+                client_id=client_id,
+                client_secret=storage_options.get("client_secret"),
+            )
+        try:
+            if connection_string is not None:
+                self._service_client = BlobServiceClient.from_connection_string(
+                    conn_str=connection_string,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            elif client_name is not None:
+                account_url = f"https://{client_name}.blob.core.windows.net"
+                cred = credential_from_client_id or credential or account_key
+                if not cred and sas_token is not None:
+                    if not sas_token.startswith("?"):
+                        sas_token = f"?{sas_token}"
+                    account_url = account_url + sas_token
+                self._service_client = BlobServiceClient(
+                    account_url=account_url,
+                    credential=cred,
+                    max_block_size=self.max_blocksize,
+                    max_single_put_size=self.max_single_put_size,
+                )
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Must provide either a connection_string or account_name with credentials"
+                )
+        except Exception as e:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"unable to connect to account for {e}"
+            )
 
     def _convert_key_to_remote_path(self, key):
         key = key.strip("/")
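
_do_connect resolves credentials in a fixed order: a connection string wins outright; otherwise an account name is combined with one credential (key, SAS token, or an AAD client secret wrapped in ClientSecretCredential). A hedged configuration sketch with placeholder values:

# Hedged sketch of the credential precedence implemented by _do_connect;
# every value below is a placeholder.
import os

# Highest precedence: a full connection string.
os.environ["AZURE_STORAGE_CONNECTION_STRING"] = "<connection-string>"

# Otherwise: account name plus exactly one of account key, SAS token, or an
# AAD client secret (tenant_id + client_id + client_secret); the client secret
# is only used when neither key nor SAS token is present.
# os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = "<account>"
# os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = "<key>"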
@@ -82,7 +167,15 @@ class AzureBlobStore(DataStore):
 
     def upload(self, key, src_path):
         remote_path = self._convert_key_to_remote_path(key)
-        self.filesystem.put_file(src_path, remote_path, overwrite=True)
+        container, remote_path = remote_path.split("/", 1)
+        container_client = self.service_client.get_container_client(container=container)
+        with open(file=src_path, mode="rb") as data:
+            container_client.upload_blob(
+                name=remote_path,
+                data=data,
+                overwrite=True,
+                max_concurrency=self.max_concurrency,
+            )
 
     def get(self, key, size=None, offset=0):
         remote_path = self._convert_key_to_remote_path(key)
@@ -96,12 +189,7 @@
                 "Append mode not supported for Azure blob datastore"
             )
         remote_path = self._convert_key_to_remote_path(key)
-        if isinstance(data, bytes):
-            mode = "wb"
-        elif isinstance(data, str):
-            mode = "w"
-        else:
-            raise TypeError("Data type unknown. Unable to put in Azure!")
+        data, mode = self._prepare_put_data(data, append)
         with self.filesystem.open(remote_path, mode) as f:
             f.write(data)
 
@@ -135,7 +223,7 @@
 
     def get_spark_options(self):
         res = {}
-        st = self.get_storage_options()
+        st = self.storage_options
         service = "blob"
         primary_url = None
         if st.get("connection_string"):
@@ -158,18 +246,17 @@
                 st[key] = parsed_value
 
         account_name = st.get("account_name")
-        if not account_name:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Property 'account_name' is absent both in storage settings and connection string"
-            )
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
             host = primary_url
-        else:
+        elif account_name:
             host = f"{account_name}.{service}.core.windows.net"
+        else:
+            return res
+
         if "account_key" in st:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
 
@@ -199,3 +286,17 @@
             )
             res[f"spark.hadoop.fs.azure.sas.fixed.token.{host}"] = st["sas_token"]
         return res
+
+    @property
+    def spark_url(self):
+        spark_options = self.get_spark_options()
+        url = f"wasbs://{self.endpoint}"
+        prefix = "spark.hadoop.fs.azure.account.key."
+        if spark_options:
+            for key in spark_options:
+                if key.startswith(prefix):
+                    account_key = key[len(prefix) :]
+                    if not url.endswith(account_key):
+                        url += f"@{account_key}"
+                    break
+        return url
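
Uploads now go through BlobServiceClient rather than fsspec's put_file; a hedged sketch of the user-visible path (container and file names are placeholders):

# Hedged sketch: container and paths are placeholders. upload() on an az://
# data item now routes through service_client.upload_blob with
# max_concurrency=100, while get()/put() still use the adlfs filesystem.
import mlrun

item = mlrun.get_dataitem("az://my-container/path/blob.parquet")
item.upload("local.parquet")
print(item.stat().size)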