mlrun 1.3.3__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (444)
  1. mlrun/__init__.py +3 -3
  2. mlrun/__main__.py +79 -37
  3. mlrun/api/__init__.py +1 -1
  4. mlrun/api/api/__init__.py +1 -1
  5. mlrun/api/api/api.py +4 -4
  6. mlrun/api/api/deps.py +10 -21
  7. mlrun/api/api/endpoints/__init__.py +1 -1
  8. mlrun/api/api/endpoints/artifacts.py +64 -36
  9. mlrun/api/api/endpoints/auth.py +4 -4
  10. mlrun/api/api/endpoints/background_tasks.py +11 -11
  11. mlrun/api/api/endpoints/client_spec.py +5 -5
  12. mlrun/api/api/endpoints/clusterization_spec.py +6 -4
  13. mlrun/api/api/endpoints/feature_store.py +124 -115
  14. mlrun/api/api/endpoints/files.py +22 -14
  15. mlrun/api/api/endpoints/frontend_spec.py +28 -21
  16. mlrun/api/api/endpoints/functions.py +142 -87
  17. mlrun/api/api/endpoints/grafana_proxy.py +89 -442
  18. mlrun/api/api/endpoints/healthz.py +20 -7
  19. mlrun/api/api/endpoints/hub.py +320 -0
  20. mlrun/api/api/endpoints/internal/__init__.py +1 -1
  21. mlrun/api/api/endpoints/internal/config.py +1 -1
  22. mlrun/api/api/endpoints/internal/memory_reports.py +9 -9
  23. mlrun/api/api/endpoints/logs.py +11 -11
  24. mlrun/api/api/endpoints/model_endpoints.py +74 -70
  25. mlrun/api/api/endpoints/operations.py +13 -9
  26. mlrun/api/api/endpoints/pipelines.py +93 -88
  27. mlrun/api/api/endpoints/projects.py +35 -35
  28. mlrun/api/api/endpoints/runs.py +69 -27
  29. mlrun/api/api/endpoints/runtime_resources.py +28 -28
  30. mlrun/api/api/endpoints/schedules.py +98 -41
  31. mlrun/api/api/endpoints/secrets.py +37 -32
  32. mlrun/api/api/endpoints/submit.py +12 -12
  33. mlrun/api/api/endpoints/tags.py +20 -22
  34. mlrun/api/api/utils.py +251 -42
  35. mlrun/api/constants.py +1 -1
  36. mlrun/api/crud/__init__.py +18 -15
  37. mlrun/api/crud/artifacts.py +10 -10
  38. mlrun/api/crud/client_spec.py +4 -4
  39. mlrun/api/crud/clusterization_spec.py +3 -3
  40. mlrun/api/crud/feature_store.py +54 -46
  41. mlrun/api/crud/functions.py +3 -3
  42. mlrun/api/crud/hub.py +312 -0
  43. mlrun/api/crud/logs.py +11 -9
  44. mlrun/api/crud/model_monitoring/__init__.py +3 -3
  45. mlrun/api/crud/model_monitoring/grafana.py +435 -0
  46. mlrun/api/crud/model_monitoring/model_endpoints.py +352 -129
  47. mlrun/api/crud/notifications.py +149 -0
  48. mlrun/api/crud/pipelines.py +67 -52
  49. mlrun/api/crud/projects.py +51 -23
  50. mlrun/api/crud/runs.py +7 -5
  51. mlrun/api/crud/runtime_resources.py +13 -13
  52. mlrun/api/{db/filedb → crud/runtimes}/__init__.py +1 -1
  53. mlrun/api/crud/runtimes/nuclio/__init__.py +14 -0
  54. mlrun/api/crud/runtimes/nuclio/function.py +505 -0
  55. mlrun/api/crud/runtimes/nuclio/helpers.py +310 -0
  56. mlrun/api/crud/secrets.py +88 -46
  57. mlrun/api/crud/tags.py +5 -5
  58. mlrun/api/db/__init__.py +1 -1
  59. mlrun/api/db/base.py +102 -54
  60. mlrun/api/db/init_db.py +2 -3
  61. mlrun/api/db/session.py +4 -12
  62. mlrun/api/db/sqldb/__init__.py +1 -1
  63. mlrun/api/db/sqldb/db.py +439 -196
  64. mlrun/api/db/sqldb/helpers.py +1 -1
  65. mlrun/api/db/sqldb/models/__init__.py +3 -3
  66. mlrun/api/db/sqldb/models/models_mysql.py +82 -64
  67. mlrun/api/db/sqldb/models/models_sqlite.py +76 -64
  68. mlrun/api/db/sqldb/session.py +27 -20
  69. mlrun/api/initial_data.py +82 -24
  70. mlrun/api/launcher.py +196 -0
  71. mlrun/api/main.py +91 -22
  72. mlrun/api/middlewares.py +6 -5
  73. mlrun/api/migrations_mysql/env.py +1 -1
  74. mlrun/api/migrations_mysql/versions/28383af526f3_market_place_to_hub.py +40 -0
  75. mlrun/api/migrations_mysql/versions/32bae1b0e29c_increase_timestamp_fields_precision.py +1 -1
  76. mlrun/api/migrations_mysql/versions/4903aef6a91d_tag_foreign_key_and_cascades.py +1 -1
  77. mlrun/api/migrations_mysql/versions/5f1351c88a19_adding_background_tasks_table.py +1 -1
  78. mlrun/api/migrations_mysql/versions/88e656800d6a_add_requested_logs_column_and_index_to_.py +1 -1
  79. mlrun/api/migrations_mysql/versions/9d16de5f03a7_adding_data_versions_table.py +1 -1
  80. mlrun/api/migrations_mysql/versions/b86f5b53f3d7_adding_name_and_updated_to_runs_table.py +1 -1
  81. mlrun/api/migrations_mysql/versions/c4af40b0bf61_init.py +1 -1
  82. mlrun/api/migrations_mysql/versions/c905d15bd91d_notifications.py +72 -0
  83. mlrun/api/migrations_mysql/versions/ee041e8fdaa0_adding_next_run_time_column_to_schedule_.py +1 -1
  84. mlrun/api/migrations_sqlite/env.py +1 -1
  85. mlrun/api/migrations_sqlite/versions/11f8dd2dc9fe_init.py +1 -1
  86. mlrun/api/migrations_sqlite/versions/1c954f8cb32d_schedule_last_run_uri.py +1 -1
  87. mlrun/api/migrations_sqlite/versions/2b6d23c715aa_adding_feature_sets.py +1 -1
  88. mlrun/api/migrations_sqlite/versions/4acd9430b093_market_place_to_hub.py +77 -0
  89. mlrun/api/migrations_sqlite/versions/6401142f2d7c_adding_next_run_time_column_to_schedule_.py +1 -1
  90. mlrun/api/migrations_sqlite/versions/64d90a1a69bc_adding_background_tasks_table.py +1 -1
  91. mlrun/api/migrations_sqlite/versions/803438ecd005_add_requested_logs_column_to_runs.py +1 -1
  92. mlrun/api/migrations_sqlite/versions/863114f0c659_refactoring_feature_set.py +1 -1
  93. mlrun/api/migrations_sqlite/versions/959ae00528ad_notifications.py +63 -0
  94. mlrun/api/migrations_sqlite/versions/accf9fc83d38_adding_data_versions_table.py +1 -1
  95. mlrun/api/migrations_sqlite/versions/b68e8e897a28_schedule_labels.py +1 -1
  96. mlrun/api/migrations_sqlite/versions/bcd0c1f9720c_adding_project_labels.py +1 -1
  97. mlrun/api/migrations_sqlite/versions/cf21882f938e_schedule_id.py +1 -1
  98. mlrun/api/migrations_sqlite/versions/d781f58f607f_tag_object_name_string.py +1 -1
  99. mlrun/api/migrations_sqlite/versions/deac06871ace_adding_marketplace_sources_table.py +1 -1
  100. mlrun/api/migrations_sqlite/versions/e1dd5983c06b_schedule_concurrency_limit.py +1 -1
  101. mlrun/api/migrations_sqlite/versions/e5594ed3ab53_adding_name_and_updated_to_runs_table.py +1 -1
  102. mlrun/api/migrations_sqlite/versions/f4249b4ba6fa_adding_feature_vectors.py +1 -1
  103. mlrun/api/migrations_sqlite/versions/f7b5a1a03629_adding_feature_labels.py +1 -1
  104. mlrun/api/schemas/__init__.py +216 -138
  105. mlrun/api/utils/__init__.py +1 -1
  106. mlrun/api/utils/asyncio.py +1 -1
  107. mlrun/api/utils/auth/__init__.py +1 -1
  108. mlrun/api/utils/auth/providers/__init__.py +1 -1
  109. mlrun/api/utils/auth/providers/base.py +7 -7
  110. mlrun/api/utils/auth/providers/nop.py +6 -7
  111. mlrun/api/utils/auth/providers/opa.py +17 -17
  112. mlrun/api/utils/auth/verifier.py +36 -34
  113. mlrun/api/utils/background_tasks.py +24 -24
  114. mlrun/{builder.py → api/utils/builder.py} +216 -123
  115. mlrun/api/utils/clients/__init__.py +1 -1
  116. mlrun/api/utils/clients/chief.py +19 -4
  117. mlrun/api/utils/clients/iguazio.py +106 -60
  118. mlrun/api/utils/clients/log_collector.py +1 -1
  119. mlrun/api/utils/clients/nuclio.py +23 -23
  120. mlrun/api/utils/clients/protocols/grpc.py +2 -2
  121. mlrun/api/utils/db/__init__.py +1 -1
  122. mlrun/api/utils/db/alembic.py +1 -1
  123. mlrun/api/utils/db/backup.py +1 -1
  124. mlrun/api/utils/db/mysql.py +24 -25
  125. mlrun/api/utils/db/sql_collation.py +1 -1
  126. mlrun/api/utils/db/sqlite_migration.py +2 -2
  127. mlrun/api/utils/events/__init__.py +14 -0
  128. mlrun/api/utils/events/base.py +57 -0
  129. mlrun/api/utils/events/events_factory.py +41 -0
  130. mlrun/api/utils/events/iguazio.py +217 -0
  131. mlrun/api/utils/events/nop.py +55 -0
  132. mlrun/api/utils/helpers.py +16 -13
  133. mlrun/api/utils/memory_reports.py +1 -1
  134. mlrun/api/utils/periodic.py +6 -3
  135. mlrun/api/utils/projects/__init__.py +1 -1
  136. mlrun/api/utils/projects/follower.py +33 -33
  137. mlrun/api/utils/projects/leader.py +36 -34
  138. mlrun/api/utils/projects/member.py +27 -27
  139. mlrun/api/utils/projects/remotes/__init__.py +1 -1
  140. mlrun/api/utils/projects/remotes/follower.py +13 -13
  141. mlrun/api/utils/projects/remotes/leader.py +10 -10
  142. mlrun/api/utils/projects/remotes/nop_follower.py +27 -21
  143. mlrun/api/utils/projects/remotes/nop_leader.py +17 -16
  144. mlrun/api/utils/scheduler.py +140 -51
  145. mlrun/api/utils/singletons/__init__.py +1 -1
  146. mlrun/api/utils/singletons/db.py +9 -15
  147. mlrun/api/utils/singletons/k8s.py +677 -5
  148. mlrun/api/utils/singletons/logs_dir.py +1 -1
  149. mlrun/api/utils/singletons/project_member.py +1 -1
  150. mlrun/api/utils/singletons/scheduler.py +1 -1
  151. mlrun/artifacts/__init__.py +2 -2
  152. mlrun/artifacts/base.py +8 -2
  153. mlrun/artifacts/dataset.py +5 -3
  154. mlrun/artifacts/manager.py +7 -1
  155. mlrun/artifacts/model.py +15 -4
  156. mlrun/artifacts/plots.py +1 -1
  157. mlrun/common/__init__.py +1 -1
  158. mlrun/common/constants.py +15 -0
  159. mlrun/common/model_monitoring.py +209 -0
  160. mlrun/common/schemas/__init__.py +167 -0
  161. mlrun/{api → common}/schemas/artifact.py +13 -14
  162. mlrun/{api → common}/schemas/auth.py +10 -8
  163. mlrun/{api → common}/schemas/background_task.py +3 -3
  164. mlrun/{api → common}/schemas/client_spec.py +1 -1
  165. mlrun/{api → common}/schemas/clusterization_spec.py +3 -3
  166. mlrun/{api → common}/schemas/constants.py +21 -8
  167. mlrun/common/schemas/events.py +36 -0
  168. mlrun/{api → common}/schemas/feature_store.py +2 -1
  169. mlrun/{api → common}/schemas/frontend_spec.py +7 -6
  170. mlrun/{api → common}/schemas/function.py +5 -5
  171. mlrun/{api → common}/schemas/http.py +3 -3
  172. mlrun/common/schemas/hub.py +134 -0
  173. mlrun/{api → common}/schemas/k8s.py +3 -3
  174. mlrun/{api → common}/schemas/memory_reports.py +1 -1
  175. mlrun/common/schemas/model_endpoints.py +342 -0
  176. mlrun/common/schemas/notification.py +57 -0
  177. mlrun/{api → common}/schemas/object.py +6 -6
  178. mlrun/{api → common}/schemas/pipeline.py +3 -3
  179. mlrun/{api → common}/schemas/project.py +6 -5
  180. mlrun/common/schemas/regex.py +24 -0
  181. mlrun/common/schemas/runs.py +30 -0
  182. mlrun/{api → common}/schemas/runtime_resource.py +3 -3
  183. mlrun/{api → common}/schemas/schedule.py +19 -7
  184. mlrun/{api → common}/schemas/secret.py +3 -3
  185. mlrun/{api → common}/schemas/tag.py +2 -2
  186. mlrun/common/types.py +25 -0
  187. mlrun/config.py +152 -20
  188. mlrun/data_types/__init__.py +7 -2
  189. mlrun/data_types/data_types.py +4 -2
  190. mlrun/data_types/infer.py +1 -1
  191. mlrun/data_types/spark.py +10 -3
  192. mlrun/datastore/__init__.py +10 -3
  193. mlrun/datastore/azure_blob.py +1 -1
  194. mlrun/datastore/base.py +185 -53
  195. mlrun/datastore/datastore.py +1 -1
  196. mlrun/datastore/filestore.py +1 -1
  197. mlrun/datastore/google_cloud_storage.py +1 -1
  198. mlrun/datastore/inmem.py +4 -1
  199. mlrun/datastore/redis.py +1 -1
  200. mlrun/datastore/s3.py +1 -1
  201. mlrun/datastore/sources.py +192 -70
  202. mlrun/datastore/spark_udf.py +44 -0
  203. mlrun/datastore/store_resources.py +4 -4
  204. mlrun/datastore/targets.py +115 -45
  205. mlrun/datastore/utils.py +127 -5
  206. mlrun/datastore/v3io.py +1 -1
  207. mlrun/datastore/wasbfs/__init__.py +1 -1
  208. mlrun/datastore/wasbfs/fs.py +1 -1
  209. mlrun/db/__init__.py +7 -5
  210. mlrun/db/base.py +112 -68
  211. mlrun/db/httpdb.py +445 -277
  212. mlrun/db/nopdb.py +491 -0
  213. mlrun/db/sqldb.py +112 -65
  214. mlrun/errors.py +6 -1
  215. mlrun/execution.py +44 -22
  216. mlrun/feature_store/__init__.py +1 -1
  217. mlrun/feature_store/api.py +143 -95
  218. mlrun/feature_store/common.py +16 -20
  219. mlrun/feature_store/feature_set.py +42 -12
  220. mlrun/feature_store/feature_vector.py +32 -21
  221. mlrun/feature_store/ingestion.py +9 -12
  222. mlrun/feature_store/retrieval/__init__.py +3 -2
  223. mlrun/feature_store/retrieval/base.py +388 -66
  224. mlrun/feature_store/retrieval/dask_merger.py +63 -151
  225. mlrun/feature_store/retrieval/job.py +30 -12
  226. mlrun/feature_store/retrieval/local_merger.py +40 -133
  227. mlrun/feature_store/retrieval/spark_merger.py +129 -127
  228. mlrun/feature_store/retrieval/storey_merger.py +173 -0
  229. mlrun/feature_store/steps.py +132 -15
  230. mlrun/features.py +8 -3
  231. mlrun/frameworks/__init__.py +1 -1
  232. mlrun/frameworks/_common/__init__.py +1 -1
  233. mlrun/frameworks/_common/artifacts_library.py +1 -1
  234. mlrun/frameworks/_common/mlrun_interface.py +1 -1
  235. mlrun/frameworks/_common/model_handler.py +1 -1
  236. mlrun/frameworks/_common/plan.py +1 -1
  237. mlrun/frameworks/_common/producer.py +1 -1
  238. mlrun/frameworks/_common/utils.py +1 -1
  239. mlrun/frameworks/_dl_common/__init__.py +1 -1
  240. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -1
  241. mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
  242. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +1 -1
  243. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +1 -1
  244. mlrun/frameworks/_dl_common/model_handler.py +1 -1
  245. mlrun/frameworks/_dl_common/utils.py +1 -1
  246. mlrun/frameworks/_ml_common/__init__.py +1 -1
  247. mlrun/frameworks/_ml_common/artifacts_library.py +1 -1
  248. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -1
  249. mlrun/frameworks/_ml_common/loggers/logger.py +1 -1
  250. mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
  251. mlrun/frameworks/_ml_common/model_handler.py +1 -1
  252. mlrun/frameworks/_ml_common/pkl_model_server.py +13 -1
  253. mlrun/frameworks/_ml_common/plan.py +1 -1
  254. mlrun/frameworks/_ml_common/plans/__init__.py +1 -1
  255. mlrun/frameworks/_ml_common/plans/calibration_curve_plan.py +1 -6
  256. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +1 -1
  257. mlrun/frameworks/_ml_common/plans/dataset_plan.py +1 -1
  258. mlrun/frameworks/_ml_common/plans/feature_importance_plan.py +1 -1
  259. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +1 -1
  260. mlrun/frameworks/_ml_common/producer.py +1 -1
  261. mlrun/frameworks/_ml_common/utils.py +1 -1
  262. mlrun/frameworks/auto_mlrun/__init__.py +1 -1
  263. mlrun/frameworks/auto_mlrun/auto_mlrun.py +1 -1
  264. mlrun/frameworks/huggingface/__init__.py +1 -1
  265. mlrun/frameworks/huggingface/model_server.py +1 -1
  266. mlrun/frameworks/lgbm/__init__.py +1 -1
  267. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -1
  268. mlrun/frameworks/lgbm/callbacks/callback.py +1 -1
  269. mlrun/frameworks/lgbm/callbacks/logging_callback.py +1 -1
  270. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +1 -1
  271. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -1
  272. mlrun/frameworks/lgbm/mlrun_interfaces/booster_mlrun_interface.py +1 -1
  273. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +1 -1
  274. mlrun/frameworks/lgbm/mlrun_interfaces/model_mlrun_interface.py +1 -1
  275. mlrun/frameworks/lgbm/model_handler.py +1 -1
  276. mlrun/frameworks/lgbm/model_server.py +1 -1
  277. mlrun/frameworks/lgbm/utils.py +1 -1
  278. mlrun/frameworks/onnx/__init__.py +1 -1
  279. mlrun/frameworks/onnx/dataset.py +1 -1
  280. mlrun/frameworks/onnx/mlrun_interface.py +1 -1
  281. mlrun/frameworks/onnx/model_handler.py +1 -1
  282. mlrun/frameworks/onnx/model_server.py +1 -1
  283. mlrun/frameworks/parallel_coordinates.py +1 -1
  284. mlrun/frameworks/pytorch/__init__.py +1 -1
  285. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -1
  286. mlrun/frameworks/pytorch/callbacks/callback.py +1 -1
  287. mlrun/frameworks/pytorch/callbacks/logging_callback.py +1 -1
  288. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +1 -1
  289. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +1 -1
  290. mlrun/frameworks/pytorch/callbacks_handler.py +1 -1
  291. mlrun/frameworks/pytorch/mlrun_interface.py +1 -1
  292. mlrun/frameworks/pytorch/model_handler.py +1 -1
  293. mlrun/frameworks/pytorch/model_server.py +1 -1
  294. mlrun/frameworks/pytorch/utils.py +1 -1
  295. mlrun/frameworks/sklearn/__init__.py +1 -1
  296. mlrun/frameworks/sklearn/estimator.py +1 -1
  297. mlrun/frameworks/sklearn/metric.py +1 -1
  298. mlrun/frameworks/sklearn/metrics_library.py +1 -1
  299. mlrun/frameworks/sklearn/mlrun_interface.py +1 -1
  300. mlrun/frameworks/sklearn/model_handler.py +1 -1
  301. mlrun/frameworks/sklearn/utils.py +1 -1
  302. mlrun/frameworks/tf_keras/__init__.py +1 -1
  303. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -1
  304. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
  305. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +1 -1
  306. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +1 -1
  307. mlrun/frameworks/tf_keras/mlrun_interface.py +1 -1
  308. mlrun/frameworks/tf_keras/model_handler.py +1 -1
  309. mlrun/frameworks/tf_keras/model_server.py +1 -1
  310. mlrun/frameworks/tf_keras/utils.py +1 -1
  311. mlrun/frameworks/xgboost/__init__.py +1 -1
  312. mlrun/frameworks/xgboost/mlrun_interface.py +1 -1
  313. mlrun/frameworks/xgboost/model_handler.py +1 -1
  314. mlrun/frameworks/xgboost/utils.py +1 -1
  315. mlrun/k8s_utils.py +14 -765
  316. mlrun/kfpops.py +14 -17
  317. mlrun/launcher/__init__.py +13 -0
  318. mlrun/launcher/base.py +406 -0
  319. mlrun/launcher/client.py +159 -0
  320. mlrun/launcher/factory.py +50 -0
  321. mlrun/launcher/local.py +276 -0
  322. mlrun/launcher/remote.py +178 -0
  323. mlrun/lists.py +10 -2
  324. mlrun/mlutils/__init__.py +1 -1
  325. mlrun/mlutils/data.py +1 -1
  326. mlrun/mlutils/models.py +1 -1
  327. mlrun/mlutils/plots.py +1 -1
  328. mlrun/model.py +252 -14
  329. mlrun/model_monitoring/__init__.py +41 -0
  330. mlrun/model_monitoring/features_drift_table.py +1 -1
  331. mlrun/model_monitoring/helpers.py +123 -38
  332. mlrun/model_monitoring/model_endpoint.py +144 -0
  333. mlrun/model_monitoring/model_monitoring_batch.py +310 -259
  334. mlrun/model_monitoring/stores/__init__.py +106 -0
  335. mlrun/model_monitoring/stores/kv_model_endpoint_store.py +448 -0
  336. mlrun/model_monitoring/stores/model_endpoint_store.py +147 -0
  337. mlrun/model_monitoring/stores/models/__init__.py +23 -0
  338. mlrun/model_monitoring/stores/models/base.py +18 -0
  339. mlrun/model_monitoring/stores/models/mysql.py +100 -0
  340. mlrun/model_monitoring/stores/models/sqlite.py +98 -0
  341. mlrun/model_monitoring/stores/sql_model_endpoint_store.py +370 -0
  342. mlrun/model_monitoring/stream_processing_fs.py +239 -271
  343. mlrun/package/__init__.py +163 -0
  344. mlrun/package/context_handler.py +325 -0
  345. mlrun/package/errors.py +47 -0
  346. mlrun/package/packager.py +298 -0
  347. mlrun/{runtimes/package → package/packagers}/__init__.py +3 -1
  348. mlrun/package/packagers/default_packager.py +422 -0
  349. mlrun/package/packagers/numpy_packagers.py +612 -0
  350. mlrun/package/packagers/pandas_packagers.py +968 -0
  351. mlrun/package/packagers/python_standard_library_packagers.py +616 -0
  352. mlrun/package/packagers_manager.py +786 -0
  353. mlrun/package/utils/__init__.py +53 -0
  354. mlrun/package/utils/_archiver.py +226 -0
  355. mlrun/package/utils/_formatter.py +211 -0
  356. mlrun/package/utils/_pickler.py +234 -0
  357. mlrun/package/utils/_supported_format.py +71 -0
  358. mlrun/package/utils/log_hint_utils.py +93 -0
  359. mlrun/package/utils/type_hint_utils.py +298 -0
  360. mlrun/platforms/__init__.py +1 -1
  361. mlrun/platforms/iguazio.py +34 -2
  362. mlrun/platforms/other.py +1 -1
  363. mlrun/projects/__init__.py +1 -1
  364. mlrun/projects/operations.py +14 -9
  365. mlrun/projects/pipelines.py +31 -13
  366. mlrun/projects/project.py +762 -238
  367. mlrun/render.py +49 -19
  368. mlrun/run.py +57 -326
  369. mlrun/runtimes/__init__.py +3 -9
  370. mlrun/runtimes/base.py +247 -784
  371. mlrun/runtimes/constants.py +1 -1
  372. mlrun/runtimes/daskjob.py +45 -41
  373. mlrun/runtimes/funcdoc.py +43 -7
  374. mlrun/runtimes/function.py +66 -656
  375. mlrun/runtimes/function_reference.py +1 -1
  376. mlrun/runtimes/generators.py +1 -1
  377. mlrun/runtimes/kubejob.py +99 -116
  378. mlrun/runtimes/local.py +59 -66
  379. mlrun/runtimes/mpijob/__init__.py +1 -1
  380. mlrun/runtimes/mpijob/abstract.py +13 -15
  381. mlrun/runtimes/mpijob/v1.py +3 -1
  382. mlrun/runtimes/mpijob/v1alpha1.py +1 -1
  383. mlrun/runtimes/nuclio.py +1 -1
  384. mlrun/runtimes/pod.py +51 -26
  385. mlrun/runtimes/remotesparkjob.py +3 -1
  386. mlrun/runtimes/serving.py +12 -4
  387. mlrun/runtimes/sparkjob/__init__.py +1 -2
  388. mlrun/runtimes/sparkjob/abstract.py +44 -31
  389. mlrun/runtimes/sparkjob/spark3job.py +11 -9
  390. mlrun/runtimes/utils.py +61 -42
  391. mlrun/secrets.py +16 -18
  392. mlrun/serving/__init__.py +3 -2
  393. mlrun/serving/merger.py +1 -1
  394. mlrun/serving/remote.py +1 -1
  395. mlrun/serving/routers.py +39 -42
  396. mlrun/serving/server.py +23 -13
  397. mlrun/serving/serving_wrapper.py +1 -1
  398. mlrun/serving/states.py +172 -39
  399. mlrun/serving/utils.py +1 -1
  400. mlrun/serving/v1_serving.py +1 -1
  401. mlrun/serving/v2_serving.py +29 -21
  402. mlrun/utils/__init__.py +1 -2
  403. mlrun/utils/async_http.py +8 -1
  404. mlrun/utils/azure_vault.py +1 -1
  405. mlrun/utils/clones.py +2 -2
  406. mlrun/utils/condition_evaluator.py +65 -0
  407. mlrun/utils/db.py +52 -0
  408. mlrun/utils/helpers.py +188 -13
  409. mlrun/utils/http.py +89 -54
  410. mlrun/utils/logger.py +48 -8
  411. mlrun/utils/model_monitoring.py +132 -100
  412. mlrun/utils/notifications/__init__.py +1 -1
  413. mlrun/utils/notifications/notification/__init__.py +8 -6
  414. mlrun/utils/notifications/notification/base.py +20 -14
  415. mlrun/utils/notifications/notification/console.py +7 -4
  416. mlrun/utils/notifications/notification/git.py +36 -19
  417. mlrun/utils/notifications/notification/ipython.py +10 -8
  418. mlrun/utils/notifications/notification/slack.py +18 -13
  419. mlrun/utils/notifications/notification_pusher.py +377 -56
  420. mlrun/utils/regex.py +6 -1
  421. mlrun/utils/singleton.py +1 -1
  422. mlrun/utils/v3io_clients.py +1 -1
  423. mlrun/utils/vault.py +270 -269
  424. mlrun/utils/version/__init__.py +1 -1
  425. mlrun/utils/version/version.json +2 -2
  426. mlrun/utils/version/version.py +1 -1
  427. {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/METADATA +16 -10
  428. mlrun-1.4.0.dist-info/RECORD +434 -0
  429. mlrun/api/api/endpoints/marketplace.py +0 -257
  430. mlrun/api/crud/marketplace.py +0 -221
  431. mlrun/api/crud/model_monitoring/model_endpoint_store.py +0 -847
  432. mlrun/api/db/filedb/db.py +0 -518
  433. mlrun/api/schemas/marketplace.py +0 -128
  434. mlrun/api/schemas/model_endpoints.py +0 -185
  435. mlrun/db/filedb.py +0 -891
  436. mlrun/feature_store/retrieval/online.py +0 -92
  437. mlrun/model_monitoring/constants.py +0 -67
  438. mlrun/runtimes/package/context_handler.py +0 -711
  439. mlrun/runtimes/sparkjob/spark2job.py +0 -59
  440. mlrun-1.3.3.dist-info/RECORD +0 -381
  441. {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/LICENSE +0 -0
  442. {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/WHEEL +0 -0
  443. {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/entry_points.txt +0 -0
  444. {mlrun-1.3.3.dist-info → mlrun-1.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,968 @@
1
+ # Copyright 2023 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ import importlib
16
+ import os
17
+ import pathlib
18
+ import tempfile
19
+ from abc import ABC, abstractmethod
20
+ from typing import Any, List, Tuple, Union
21
+
22
+ import pandas as pd
23
+
24
+ from mlrun.artifacts import Artifact, DatasetArtifact
25
+ from mlrun.datastore import DataItem
26
+ from mlrun.errors import MLRunInvalidArgumentError
27
+
28
+ from ..utils import ArtifactType, SupportedFormat
29
+ from .default_packager import DefaultPackager
30
+
31
+
32
class _Formatter(ABC):
    """
    An abstract class for a pandas formatter - supporting saving and loading dataframes to and from specific file type.
    """

    @classmethod
    @abstractmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the relevant `to_x` function.

        :return A dictionary of keyword arguments for reading the dataframe from file.
        """
        pass

    @classmethod
    @abstractmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read the dataframe from the given file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the relevant read function of pandas.

        :return: The loaded dataframe.
        """
        pass

    @staticmethod
    def _flatten_dataframe(dataframe: pd.DataFrame) -> Tuple[pd.DataFrame, dict]:
        """
        Flatten the dataframe: moving all indexes to be columns at the start (from column 0) and lowering the columns
        levels to 1, renaming them from tuples. All columns and index info is stored so it can be unflattened later on.

        Note: the dataframe is modified in place and also returned.

        :param dataframe: The dataframe to flatten.

        :return: The flat dataframe together with a dictionary of keyword arguments for `_unflatten_dataframe`.
        """
        # Save columns info:
        columns = list(dataframe.columns)
        if isinstance(dataframe.columns, pd.MultiIndex):
            columns = [list(column_tuple) for column_tuple in columns]
        columns_levels = list(dataframe.columns.names)

        # Save index info:
        index_levels = list(dataframe.index.names)

        # Turn multi-index columns into single columns:
        if len(columns_levels) > 1:
            # We turn the column tuple into a string to eliminate parsing issues during saving to text formats.
            # Labels are cast to `str` so that non-string labels (e.g. ints in a MultiIndex) don't break the join;
            # the original labels are preserved in the returned instructions, so unflattening restores them exactly:
            dataframe.columns = pd.Index(
                "-".join(str(label) for label in column_tuple)
                for column_tuple in columns
            )

        # Rename indexes in case they appear in the columns so they won't get overridden when the index is reset:
        dataframe.index.set_names(
            names=[
                name
                if name is not None and name not in dataframe.columns
                else f"INDEX_{name}_{i}"
                for i, name in enumerate(dataframe.index.names)
            ],
            inplace=True,
        )

        # Reset the index, moving the current index to a column:
        dataframe.reset_index(inplace=True)

        return dataframe, {
            "columns": columns,
            "columns_levels": columns_levels,
            "index_levels": index_levels,
        }

    @staticmethod
    def _unflatten_dataframe(
        dataframe: pd.DataFrame,
        columns: list,
        columns_levels: list,
        index_levels: list,
    ) -> pd.DataFrame:
        """
        Unflatten the dataframe, moving the indexes from the columns and resuming the columns levels and names.

        :param dataframe:      The dataframe to unflatten.
        :param columns:        The original list of columns.
        :param columns_levels: The original columns levels names.
        :param index_levels:   The original index levels names.

        :return: The un-flattened dataframe.
        """
        # Move back index from columns (the first `len(index_levels)` columns were produced by `reset_index`):
        dataframe.set_index(
            keys=list(dataframe.columns[: len(index_levels)]), inplace=True
        )
        dataframe.index.set_names(names=index_levels, inplace=True)

        # Set the columns back in case they were multi-leveled:
        if len(columns_levels) > 1:
            dataframe.columns = pd.MultiIndex.from_tuples(
                tuples=columns, names=columns_levels
            )
        else:
            dataframe.columns.set_names(names=columns_levels, inplace=True)

        return dataframe
152
class _ParquetFormatter(_Formatter):
    """
    A static class for managing pandas parquet files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Write the given dataframe to the parquet file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Ignored for parquet format.
        :param to_kwargs: Additional keyword arguments passed on to the `to_parquet` function.

        :return A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Parquet keeps indexes and column levels on its own, so no flattening
        # is applied and no read instructions need to be returned:
        obj.to_parquet(path=file_path, **to_kwargs)
        return {}

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Load a dataframe back from the given parquet file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Ignored for parquet format.
        :param read_kwargs:      Additional keyword arguments passed on to the `read_parquet` function.

        :return: The loaded dataframe.
        """
        dataframe = pd.read_parquet(path=file_path, **read_kwargs)
        return dataframe
190
class _CSVFormatter(_Formatter):
    """
    A static class for managing pandas csv files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Write the given dataframe to the csv file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments passed on to the `to_csv` function.

        :return A dictionary of keyword arguments for reading the dataframe from file.
        """
        # csv cannot represent multi-level columns / indexes reliably, so the
        # dataframe is flattened first and the unflatten instructions returned:
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            read_instructions = {"unflatten_kwargs": unflatten_kwargs}
        else:
            read_instructions = {}

        # Write to csv:
        obj.to_csv(path_or_buf=file_path, **to_kwargs)

        return read_instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Load a dataframe back from the given csv file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments passed on to the `read_csv` function.

        :return: The loaded dataframe.
        """
        # Read the csv:
        dataframe = pd.read_csv(filepath_or_buffer=file_path, **read_kwargs)

        # Nothing more to do if the dataframe was not flattened during packing:
        if unflatten_kwargs is None:
            return dataframe

        # Drop the default RangeIndex column that `to_csv` wrote alongside the
        # reset index (`read_csv` names it "Unnamed: 0"):
        if dataframe.columns[0] == "Unnamed: 0":
            dataframe.drop(columns=["Unnamed: 0"], inplace=True)

        # Unflatten the dataframe:
        return cls._unflatten_dataframe(dataframe=dataframe, **unflatten_kwargs)
249
class _H5Formatter(_Formatter):
    """
    A static class for managing pandas h5 files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the h5 file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Ignored for h5 format.
        :param to_kwargs: Additional keyword arguments to pass to the `to_hdf` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Fall back to the default key 'table' when the user did not provide one:
        hdf_key = to_kwargs.pop("key", "table")

        obj.to_hdf(path_or_buf=file_path, key=hdf_key, **to_kwargs)

        # The key is needed again for reading the dataframe back from the file:
        return {"key": hdf_key}

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read the dataframe from the given h5 file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Ignored for h5 format.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_hdf` function.

        :return: The loaded dataframe.
        """
        return pd.read_hdf(path_or_buf=file_path, **read_kwargs)
290
+
291
+
292
class _XMLFormatter(_Formatter):
    """
    A static class for managing pandas xml files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the xml file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_xml` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Get the parser (if not provided, try to use `lxml`, otherwise `etree`):
        parser = to_kwargs.pop("parser", None)
        if parser is None:
            try:
                importlib.import_module("lxml")
                parser = "lxml"
            except ModuleNotFoundError:
                parser = "etree"
        instructions = {"parser": parser}

        # Flatten the dataframe (this format have problems saving multi-level dataframes):
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        # Write to xml with the selected parser (it was previously hard-coded to "etree", discarding the
        # detected `lxml` parser or the user's choice):
        obj.to_xml(path_or_buffer=file_path, parser=parser, **to_kwargs)

        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read the dataframe from the given xml file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_xml` function.

        :return: The loaded dataframe.
        """
        # Read the xml:
        obj = pd.read_xml(path_or_buffer=file_path, **read_kwargs)

        # Check if it was flattened in packing:
        if unflatten_kwargs is not None:
            # Remove the default index (joined with reset index):
            if obj.columns[0] == "index":
                obj.drop(columns=["index"], inplace=True)
            # Unflatten the dataframe:
            obj = cls._unflatten_dataframe(dataframe=obj, **unflatten_kwargs)

        return obj
358
+
359
+
360
class _XLSXFormatter(_Formatter):
    """
    A static class for managing pandas xlsx files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the xlsx file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_excel` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Record the engine used for writing so the same one can be used when unpacking:
        instructions = {"engine": to_kwargs.get("engine", None)}

        # Excel cannot reliably round-trip multi-level / multi-index dataframes, so flatten first:
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        obj.to_excel(excel_writer=file_path, **to_kwargs)
        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read the dataframe from the given xlsx file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_excel` function.

        :return: The loaded dataframe.
        """
        dataframe = pd.read_excel(io=file_path, **read_kwargs)

        # No unflatten instructions means the file was not flattened during packing:
        if unflatten_kwargs is None:
            return dataframe

        # Remove the default index column written during packing (joined with reset index):
        if dataframe.columns[0] == "Unnamed: 0":
            dataframe = dataframe.drop(columns=["Unnamed: 0"])

        return cls._unflatten_dataframe(dataframe=dataframe, **unflatten_kwargs)
417
+
418
+
419
class _HTMLFormatter(_Formatter):
    """
    A static class for managing pandas html files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the html file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_html` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        instructions = {}

        # HTML cannot reliably round-trip multi-level / multi-index dataframes, so flatten first:
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        obj.to_html(buf=file_path, **to_kwargs)
        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read dataframes from the given html file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_html` function.

        :return: The loaded dataframe.
        """
        # `read_html` returns a list of tables, the packed dataframe is the first one:
        dataframe = pd.read_html(io=file_path, **read_kwargs)[0]

        # No unflatten instructions means the file was not flattened during packing:
        if unflatten_kwargs is None:
            return dataframe

        # Remove the default index column written during packing (joined with reset index):
        if dataframe.columns[0] == "Unnamed: 0":
            dataframe = dataframe.drop(columns=["Unnamed: 0"])

        return cls._unflatten_dataframe(dataframe=dataframe, **unflatten_kwargs)
475
+
476
+
477
class _JSONFormatter(_Formatter):
    """
    A static class for managing pandas json files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the json file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_json` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Record the orient used for writing so the same one can be used when unpacking:
        instructions = {"orient": to_kwargs.get("orient", None)}

        # JSON cannot reliably round-trip multi-level / multi-index dataframes, so flatten first:
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        obj.to_json(path_or_buf=file_path, **to_kwargs)
        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read dataframes from the given json file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_json` function.

        :return: The loaded dataframe.
        """
        dataframe = pd.read_json(path_or_buf=file_path, **read_kwargs)

        # No unflatten instructions means the file was not flattened during packing:
        if unflatten_kwargs is None:
            return dataframe

        return cls._unflatten_dataframe(dataframe=dataframe, **unflatten_kwargs)
532
+
533
+
534
class _FeatherFormatter(_Formatter):
    """
    A static class for managing pandas feather files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the feather file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_feather` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        instructions = {}

        # Feather cannot reliably round-trip multi-level / multi-index dataframes, so flatten first:
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        obj.to_feather(path=file_path, **to_kwargs)
        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read dataframes from the given feather file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_feather` function.

        :return: The loaded dataframe.
        """
        dataframe = pd.read_feather(path=file_path, **read_kwargs)

        # No unflatten instructions means the file was not flattened during packing:
        if unflatten_kwargs is None:
            return dataframe

        return cls._unflatten_dataframe(dataframe=dataframe, **unflatten_kwargs)
587
+
588
+
589
class _ORCFormatter(_Formatter):
    """
    A static class for managing pandas orc files.
    """

    @classmethod
    def to(
        cls, obj: pd.DataFrame, file_path: str, flatten: bool = True, **to_kwargs
    ) -> dict:
        """
        Save the given dataframe to the orc file path given.

        :param obj:       The dataframe to save.
        :param file_path: The file to save to.
        :param flatten:   Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                          flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                          especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs: Additional keyword arguments to pass to the `to_orc` function.

        :return: A dictionary of keyword arguments for reading the dataframe from file.
        """
        # Flatten the dataframe (this format have problems saving multi-level dataframes):
        instructions = {}
        if flatten:
            obj, unflatten_kwargs = cls._flatten_dataframe(dataframe=obj)
            instructions["unflatten_kwargs"] = unflatten_kwargs

        # Write to orc:
        obj.to_orc(path=file_path, **to_kwargs)

        return instructions

    @classmethod
    def read(
        cls, file_path: str, unflatten_kwargs: dict = None, **read_kwargs
    ) -> pd.DataFrame:
        """
        Read dataframes from the given orc file path.

        :param file_path:        The file to read the dataframe from.
        :param unflatten_kwargs: Unflatten keyword arguments for unflattening the read dataframe.
        :param read_kwargs:      Additional keyword arguments to pass to the `read_orc` function.

        :return: The loaded dataframe.
        """
        # Read the orc:
        obj = pd.read_orc(path=file_path, **read_kwargs)

        # Check if it was flattened in packing:
        if unflatten_kwargs is not None:
            obj = cls._unflatten_dataframe(dataframe=obj, **unflatten_kwargs)

        return obj
642
+
643
+
644
class PandasSupportedFormat(SupportedFormat[_Formatter]):
    """
    Library of Pandas formats (file extensions) supported by the Pandas packagers.
    """

    PARQUET = "parquet"
    CSV = "csv"
    H5 = "h5"
    XML = "xml"
    XLSX = "xlsx"
    HTML = "html"
    JSON = "json"
    FEATHER = "feather"
    ORC = "orc"

    # Map each supported file extension to the formatter class that writes / reads it:
    _FORMAT_HANDLERS_MAP = {
        PARQUET: _ParquetFormatter,
        CSV: _CSVFormatter,
        H5: _H5Formatter,
        XML: _XMLFormatter,
        XLSX: _XLSXFormatter,
        HTML: _HTMLFormatter,
        JSON: _JSONFormatter,
        FEATHER: _FeatherFormatter,
        ORC: _ORCFormatter,
    }
670
+
671
+
672
# Default file formats for pandas DataFrame and Series file artifacts:
DEFAULT_PANDAS_FORMAT = PandasSupportedFormat.PARQUET
# Parquet is only selected when all column names are strings (see `pack_file`); otherwise fall back to CSV:
NON_STRING_COLUMN_NAMES_DEFAULT_PANDAS_FORMAT = PandasSupportedFormat.CSV
675
+
676
+
677
class PandasDataFramePackager(DefaultPackager):
    """
    ``pd.DataFrame`` packager.
    """

    PACKABLE_OBJECT_TYPE = pd.DataFrame
    DEFAULT_PACKING_ARTIFACT_TYPE = ArtifactType.DATASET

    @classmethod
    def get_default_unpacking_artifact_type(cls, data_item: DataItem) -> str:
        """
        Get the default artifact type used for unpacking. Returns dataset if the data item represents a
        `DatasetArtifact` and otherwise, file.

        :param data_item: The about to be unpacked data item.

        :return: The default artifact type.
        """
        is_artifact = data_item.get_artifact_type()
        if is_artifact and is_artifact == "datasets":
            return ArtifactType.DATASET
        return ArtifactType.FILE

    @classmethod
    def pack_result(cls, obj: pd.DataFrame, key: str) -> dict:
        """
        Pack a dataframe as a result.

        :param obj: The dataframe to pack and log.
        :param key: The result's key.

        :return: The result dictionary.
        """
        # Parse to dictionary according to the indexes in the dataframe:
        if len(obj.index.names) > 1:
            # Multiple indexes:
            orient = "split"
        elif obj.index.name is not None:
            # Not a default index (user would likely want to keep it):
            orient = "dict"
        else:
            # Default index can be ignored:
            orient = "list"

        # Cast to dictionary:
        dataframe_dictionary = obj.to_dict(orient=orient)

        # Prepare the result (casting tuples to lists):
        dataframe_dictionary = PandasDataFramePackager._prepare_result(
            obj=dataframe_dictionary
        )

        return super().pack_result(obj=dataframe_dictionary, key=key)

    @classmethod
    def pack_file(
        cls,
        obj: pd.DataFrame,
        key: str,
        file_format: str = None,
        flatten: bool = True,
        **to_kwargs,
    ) -> Tuple[Artifact, dict]:
        """
        Pack a dataframe as a file by the given format.

        :param obj:         The dataframe to pack.
        :param key:         The key to use for the artifact.
        :param file_format: The file format to save as. Default is parquet or csv (depends on the column names as
                            parquet cannot be used for non string column names).
        :param flatten:     Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                            flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                            especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs:   Additional keyword arguments to pass to the pandas `to_x` functions.

        :return: The packed artifact and instructions.
        """
        # Set default file format if not given (parquet requires string column names):
        if file_format is None:
            file_format = (
                DEFAULT_PANDAS_FORMAT
                if all(isinstance(name, str) for name in obj.columns)
                else NON_STRING_COLUMN_NAMES_DEFAULT_PANDAS_FORMAT
            )

        # Save to file:
        formatter = PandasSupportedFormat.get_format_handler(fmt=file_format)
        temp_directory = pathlib.Path(tempfile.mkdtemp())
        cls.add_future_clearing_path(path=temp_directory)
        file_path = temp_directory / f"{key}.{file_format}"
        read_kwargs = formatter.to(
            obj=obj, file_path=str(file_path), flatten=flatten, **to_kwargs
        )

        # Create the artifact and instructions:
        artifact = Artifact(key=key, src_path=os.path.abspath(file_path))

        return artifact, {"file_format": file_format, "read_kwargs": read_kwargs}

    @classmethod
    def pack_dataset(cls, obj: pd.DataFrame, key: str, file_format: str = "parquet"):
        """
        Pack a pandas dataframe as a dataset.

        :param obj:         The dataframe to pack.
        :param key:         The key to use for the artifact.
        :param file_format: The file format to save as. Default is parquet.

        :return: The packed artifact and instructions.
        """
        return DatasetArtifact(key=key, df=obj, format=file_format), {}

    @classmethod
    def unpack_file(
        cls,
        data_item: DataItem,
        file_format: str = None,
        read_kwargs: dict = None,
    ) -> pd.DataFrame:
        """
        Unpack a pandas dataframe from file.

        :param data_item:   The data item to unpack.
        :param file_format: The file format to use for reading the dataframe. Default is None - will be read by the
                            file extension.
        :param read_kwargs: Keyword arguments to pass to the read of the formatter.

        :return: The unpacked dataframe.
        """
        # Get the file:
        file_path = data_item.local()
        cls.add_future_clearing_path(path=file_path)

        # Get the archive format by the file extension if needed:
        if file_format is None:
            file_format = PandasSupportedFormat.match_format(path=file_path)
        if file_format is None:
            raise MLRunInvalidArgumentError(
                f"File format of {data_item.key} ('{''.join(pathlib.Path(file_path).suffixes)}') is not supported. "
                f"Supported formats are: {' '.join(PandasSupportedFormat.get_all_formats())}"
            )

        # Read the object:
        formatter = PandasSupportedFormat.get_format_handler(fmt=file_format)
        if read_kwargs is None:
            read_kwargs = {}
        return formatter.read(file_path=file_path, **read_kwargs)

    @classmethod
    def unpack_dataset(cls, data_item: DataItem):
        """
        Unpack a pandas dataframe from a dataset artifact.

        :param data_item: The data item to unpack.

        :return: The unpacked dataframe.
        """
        return data_item.as_df()

    @staticmethod
    def _prepare_result(obj: Union[list, dict, tuple]) -> Any:
        """
        A dataframe can be logged as a result when it being cast to a dictionary. If the dataframe has multiple indexes,
        pandas store them as a tuple, which is not json serializable, so we cast them into lists.

        :param obj: The dataframe dictionary (or list and tuple as it is recursive).

        :return: Prepared result.
        """
        if isinstance(obj, dict):
            # Rebuild a new dictionary instead of assigning into `obj` mid-iteration: changing a key
            # while iterating raises `RuntimeError`, and a tuple key converted into a list is
            # unhashable (`TypeError`). Converted keys are therefore cast back to tuples to stay
            # hashable while their inner values are still prepared:
            prepared_dictionary = {}
            for key, value in obj.items():
                prepared_key = PandasDataFramePackager._prepare_result(obj=key)
                if isinstance(prepared_key, list):
                    prepared_key = tuple(prepared_key)
                prepared_dictionary[prepared_key] = PandasDataFramePackager._prepare_result(
                    obj=value
                )
            return prepared_dictionary
        if isinstance(obj, list):
            return [PandasDataFramePackager._prepare_result(obj=value) for value in obj]
        if isinstance(obj, tuple):
            # Tuples are not json serializable - cast to list:
            return [PandasDataFramePackager._prepare_result(obj=value) for value in obj]
        return obj
857
+
858
+
859
class PandasSeriesPackager(PandasDataFramePackager):
    """
    ``pd.Series`` packager.
    """

    PACKABLE_OBJECT_TYPE = pd.Series
    DEFAULT_PACKING_ARTIFACT_TYPE = ArtifactType.FILE

    @classmethod
    def get_supported_artifact_types(cls) -> List[str]:
        """
        Get all the supported artifact types on this packager. It will be the same as `PandasDataFramePackager` but
        without the 'dataset' artifact type support.

        :return: A list of all the supported artifact types.
        """
        artifact_types = super().get_supported_artifact_types()
        # A series cannot be logged as a dataset artifact:
        artifact_types.remove("dataset")
        return artifact_types

    @classmethod
    def pack_result(cls, obj: pd.Series, key: str) -> dict:
        """
        Pack a series as a result.

        :param obj: The series to pack and log.
        :param key: The result's key.

        :return: The result dictionary.
        """
        # Reuse the dataframe result packing by casting the series into a single column dataframe:
        return super().pack_result(obj=pd.DataFrame(obj), key=key)

    @classmethod
    def pack_file(
        cls,
        obj: pd.Series,
        key: str,
        file_format: str = None,
        flatten: bool = True,
        **to_kwargs,
    ) -> Tuple[Artifact, dict]:
        """
        Pack a series as a file by the given format.

        :param obj:         The series to pack.
        :param key:         The key to use for the artifact.
        :param file_format: The file format to save as. Default is parquet or csv (depends on the column names as
                            parquet cannot be used for non string column names).
        :param flatten:     Whether to flatten the dataframe before saving. For some formats it is mandatory to enable
                            flattening, otherwise saving and loading the dataframe will cause unexpected behavior
                            especially in case it is multi-level or multi-index. Default to True.
        :param to_kwargs:   Additional keyword arguments to pass to the pandas `to_x` functions.

        :return: The packed artifact and instructions.
        """
        # Keep the series column name so it can be restored when unpacking:
        original_column_name = obj.name

        # Cast to a dataframe and delegate the file packing to the parent packager:
        artifact, instructions = super().pack_file(
            obj=pd.DataFrame(obj),
            key=key,
            file_format=file_format,
            flatten=flatten,
            **to_kwargs,
        )
        instructions["column_name"] = original_column_name

        return artifact, instructions

    @classmethod
    def unpack_file(
        cls,
        data_item: DataItem,
        file_format: str = None,
        read_kwargs: dict = None,
        column_name: Union[str, int] = None,
    ) -> pd.Series:
        """
        Unpack a pandas series from file.

        :param data_item:   The data item to unpack.
        :param file_format: The file format to use for reading the series. Default is None - will be read by the file
                            extension.
        :param read_kwargs: Keyword arguments to pass to the read of the formatter.
        :param column_name: The name of the series column.

        :return: The unpacked series.
        """
        # Read the file as a dataframe via the parent packager:
        dataframe = super().unpack_file(
            data_item=data_item,
            file_format=file_format,
            read_kwargs=read_kwargs,
        )

        # A series can only be constructed from a single column dataframe:
        if len(dataframe.columns) != 1:
            raise MLRunInvalidArgumentError(
                f"The data item received is of a `pandas.DataFrame` with more than one column: "
                f"{', '.join(dataframe.columns)}. Hence it cannot be turned into a `pandas.Series`."
            )
        series = dataframe[dataframe.columns[0]]

        # `read_kwargs` is only set for packed file artifacts, in which case the column name (even if
        # None) should be restored so the object matches the one originally packed:
        if read_kwargs is not None:
            series.name = column_name

        return series