mlrun 1.7.1rc10__py3-none-any.whl → 1.8.0rc11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (259) hide show
  1. mlrun/__init__.py +23 -21
  2. mlrun/__main__.py +3 -3
  3. mlrun/alerts/alert.py +148 -14
  4. mlrun/artifacts/__init__.py +2 -3
  5. mlrun/artifacts/base.py +55 -12
  6. mlrun/artifacts/dataset.py +16 -16
  7. mlrun/artifacts/document.py +378 -0
  8. mlrun/artifacts/manager.py +26 -17
  9. mlrun/artifacts/model.py +66 -53
  10. mlrun/common/constants.py +8 -0
  11. mlrun/common/formatters/__init__.py +1 -0
  12. mlrun/common/formatters/feature_set.py +1 -0
  13. mlrun/common/formatters/function.py +1 -0
  14. mlrun/{model_monitoring/db/stores/base/__init__.py → common/formatters/model_endpoint.py} +16 -1
  15. mlrun/common/formatters/pipeline.py +1 -2
  16. mlrun/common/formatters/project.py +9 -0
  17. mlrun/common/model_monitoring/__init__.py +0 -5
  18. mlrun/common/model_monitoring/helpers.py +1 -29
  19. mlrun/common/runtimes/constants.py +1 -2
  20. mlrun/common/schemas/__init__.py +6 -2
  21. mlrun/common/schemas/alert.py +111 -19
  22. mlrun/common/schemas/api_gateway.py +3 -3
  23. mlrun/common/schemas/artifact.py +11 -7
  24. mlrun/common/schemas/auth.py +6 -4
  25. mlrun/common/schemas/background_task.py +7 -7
  26. mlrun/common/schemas/client_spec.py +2 -3
  27. mlrun/common/schemas/clusterization_spec.py +2 -2
  28. mlrun/common/schemas/common.py +53 -3
  29. mlrun/common/schemas/constants.py +15 -0
  30. mlrun/common/schemas/datastore_profile.py +1 -1
  31. mlrun/common/schemas/feature_store.py +9 -9
  32. mlrun/common/schemas/frontend_spec.py +4 -4
  33. mlrun/common/schemas/function.py +10 -10
  34. mlrun/common/schemas/hub.py +1 -1
  35. mlrun/common/schemas/k8s.py +3 -3
  36. mlrun/common/schemas/memory_reports.py +3 -3
  37. mlrun/common/schemas/model_monitoring/__init__.py +2 -1
  38. mlrun/common/schemas/model_monitoring/constants.py +67 -14
  39. mlrun/common/schemas/model_monitoring/grafana.py +1 -1
  40. mlrun/common/schemas/model_monitoring/model_endpoints.py +92 -147
  41. mlrun/common/schemas/notification.py +24 -3
  42. mlrun/common/schemas/object.py +1 -1
  43. mlrun/common/schemas/pagination.py +4 -4
  44. mlrun/common/schemas/partition.py +137 -0
  45. mlrun/common/schemas/pipeline.py +2 -2
  46. mlrun/common/schemas/project.py +25 -17
  47. mlrun/common/schemas/runs.py +2 -2
  48. mlrun/common/schemas/runtime_resource.py +5 -5
  49. mlrun/common/schemas/schedule.py +1 -1
  50. mlrun/common/schemas/secret.py +1 -1
  51. mlrun/common/schemas/tag.py +3 -3
  52. mlrun/common/schemas/workflow.py +5 -5
  53. mlrun/config.py +68 -10
  54. mlrun/data_types/__init__.py +0 -2
  55. mlrun/data_types/data_types.py +1 -0
  56. mlrun/data_types/infer.py +3 -1
  57. mlrun/data_types/spark.py +5 -3
  58. mlrun/data_types/to_pandas.py +11 -2
  59. mlrun/datastore/__init__.py +2 -2
  60. mlrun/datastore/alibaba_oss.py +4 -1
  61. mlrun/datastore/azure_blob.py +4 -1
  62. mlrun/datastore/base.py +12 -4
  63. mlrun/datastore/datastore.py +9 -3
  64. mlrun/datastore/datastore_profile.py +79 -20
  65. mlrun/datastore/dbfs_store.py +4 -1
  66. mlrun/datastore/filestore.py +4 -1
  67. mlrun/datastore/google_cloud_storage.py +4 -1
  68. mlrun/datastore/hdfs.py +4 -1
  69. mlrun/datastore/inmem.py +4 -1
  70. mlrun/datastore/redis.py +4 -1
  71. mlrun/datastore/s3.py +4 -1
  72. mlrun/datastore/sources.py +52 -51
  73. mlrun/datastore/store_resources.py +7 -4
  74. mlrun/datastore/targets.py +23 -22
  75. mlrun/datastore/utils.py +2 -2
  76. mlrun/datastore/v3io.py +4 -1
  77. mlrun/datastore/vectorstore.py +229 -0
  78. mlrun/datastore/wasbfs/fs.py +13 -12
  79. mlrun/db/base.py +213 -83
  80. mlrun/db/factory.py +0 -3
  81. mlrun/db/httpdb.py +1265 -387
  82. mlrun/db/nopdb.py +205 -74
  83. mlrun/errors.py +2 -2
  84. mlrun/execution.py +136 -50
  85. mlrun/feature_store/__init__.py +0 -2
  86. mlrun/feature_store/api.py +41 -40
  87. mlrun/feature_store/common.py +9 -9
  88. mlrun/feature_store/feature_set.py +20 -18
  89. mlrun/feature_store/feature_vector.py +27 -24
  90. mlrun/feature_store/retrieval/base.py +14 -9
  91. mlrun/feature_store/retrieval/job.py +2 -1
  92. mlrun/feature_store/steps.py +2 -2
  93. mlrun/features.py +30 -13
  94. mlrun/frameworks/__init__.py +1 -2
  95. mlrun/frameworks/_common/__init__.py +1 -2
  96. mlrun/frameworks/_common/artifacts_library.py +2 -2
  97. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  98. mlrun/frameworks/_common/model_handler.py +29 -27
  99. mlrun/frameworks/_common/producer.py +3 -1
  100. mlrun/frameworks/_dl_common/__init__.py +1 -2
  101. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  102. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  103. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  104. mlrun/frameworks/_ml_common/__init__.py +1 -2
  105. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  106. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  107. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  108. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  109. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  110. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  111. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  112. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  113. mlrun/frameworks/huggingface/__init__.py +1 -2
  114. mlrun/frameworks/huggingface/model_server.py +9 -9
  115. mlrun/frameworks/lgbm/__init__.py +47 -44
  116. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  117. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  118. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  119. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  120. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  121. mlrun/frameworks/lgbm/model_handler.py +15 -11
  122. mlrun/frameworks/lgbm/model_server.py +11 -7
  123. mlrun/frameworks/lgbm/utils.py +2 -2
  124. mlrun/frameworks/onnx/__init__.py +1 -2
  125. mlrun/frameworks/onnx/dataset.py +3 -3
  126. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  127. mlrun/frameworks/onnx/model_handler.py +7 -5
  128. mlrun/frameworks/onnx/model_server.py +8 -6
  129. mlrun/frameworks/parallel_coordinates.py +11 -11
  130. mlrun/frameworks/pytorch/__init__.py +22 -23
  131. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  132. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  133. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  134. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  135. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  136. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  137. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  138. mlrun/frameworks/pytorch/model_handler.py +21 -17
  139. mlrun/frameworks/pytorch/model_server.py +13 -9
  140. mlrun/frameworks/sklearn/__init__.py +19 -18
  141. mlrun/frameworks/sklearn/estimator.py +2 -2
  142. mlrun/frameworks/sklearn/metric.py +3 -3
  143. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  144. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  145. mlrun/frameworks/sklearn/model_handler.py +4 -3
  146. mlrun/frameworks/tf_keras/__init__.py +11 -12
  147. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  148. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  149. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  150. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  151. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  152. mlrun/frameworks/tf_keras/model_server.py +12 -8
  153. mlrun/frameworks/xgboost/__init__.py +19 -18
  154. mlrun/frameworks/xgboost/model_handler.py +13 -9
  155. mlrun/launcher/base.py +3 -4
  156. mlrun/launcher/local.py +1 -1
  157. mlrun/launcher/remote.py +1 -1
  158. mlrun/lists.py +4 -3
  159. mlrun/model.py +117 -46
  160. mlrun/model_monitoring/__init__.py +4 -4
  161. mlrun/model_monitoring/api.py +72 -59
  162. mlrun/model_monitoring/applications/_application_steps.py +17 -17
  163. mlrun/model_monitoring/applications/base.py +165 -6
  164. mlrun/model_monitoring/applications/context.py +88 -37
  165. mlrun/model_monitoring/applications/evidently_base.py +0 -1
  166. mlrun/model_monitoring/applications/histogram_data_drift.py +43 -21
  167. mlrun/model_monitoring/applications/results.py +55 -3
  168. mlrun/model_monitoring/controller.py +207 -239
  169. mlrun/model_monitoring/db/__init__.py +0 -2
  170. mlrun/model_monitoring/db/_schedules.py +156 -0
  171. mlrun/model_monitoring/db/_stats.py +189 -0
  172. mlrun/model_monitoring/db/tsdb/base.py +78 -25
  173. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +61 -6
  174. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  175. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +255 -29
  176. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  177. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +78 -17
  178. mlrun/model_monitoring/helpers.py +151 -49
  179. mlrun/model_monitoring/stream_processing.py +99 -283
  180. mlrun/model_monitoring/tracking_policy.py +10 -3
  181. mlrun/model_monitoring/writer.py +48 -36
  182. mlrun/package/__init__.py +3 -6
  183. mlrun/package/context_handler.py +1 -1
  184. mlrun/package/packager.py +12 -9
  185. mlrun/package/packagers/__init__.py +0 -2
  186. mlrun/package/packagers/default_packager.py +14 -11
  187. mlrun/package/packagers/numpy_packagers.py +16 -7
  188. mlrun/package/packagers/pandas_packagers.py +18 -18
  189. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  190. mlrun/package/packagers_manager.py +31 -14
  191. mlrun/package/utils/__init__.py +0 -3
  192. mlrun/package/utils/_pickler.py +6 -6
  193. mlrun/platforms/__init__.py +47 -16
  194. mlrun/platforms/iguazio.py +4 -1
  195. mlrun/projects/operations.py +27 -27
  196. mlrun/projects/pipelines.py +71 -36
  197. mlrun/projects/project.py +890 -220
  198. mlrun/run.py +53 -10
  199. mlrun/runtimes/__init__.py +1 -3
  200. mlrun/runtimes/base.py +15 -11
  201. mlrun/runtimes/daskjob.py +9 -9
  202. mlrun/runtimes/generators.py +2 -1
  203. mlrun/runtimes/kubejob.py +4 -5
  204. mlrun/runtimes/mounts.py +572 -0
  205. mlrun/runtimes/mpijob/__init__.py +0 -2
  206. mlrun/runtimes/mpijob/abstract.py +7 -6
  207. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  208. mlrun/runtimes/nuclio/application/application.py +11 -11
  209. mlrun/runtimes/nuclio/function.py +19 -17
  210. mlrun/runtimes/nuclio/serving.py +18 -13
  211. mlrun/runtimes/pod.py +154 -45
  212. mlrun/runtimes/remotesparkjob.py +3 -2
  213. mlrun/runtimes/sparkjob/__init__.py +0 -2
  214. mlrun/runtimes/sparkjob/spark3job.py +21 -11
  215. mlrun/runtimes/utils.py +6 -5
  216. mlrun/serving/merger.py +6 -4
  217. mlrun/serving/remote.py +18 -17
  218. mlrun/serving/routers.py +185 -172
  219. mlrun/serving/server.py +7 -1
  220. mlrun/serving/states.py +97 -78
  221. mlrun/serving/utils.py +13 -2
  222. mlrun/serving/v1_serving.py +3 -2
  223. mlrun/serving/v2_serving.py +105 -72
  224. mlrun/track/__init__.py +1 -1
  225. mlrun/track/tracker.py +2 -2
  226. mlrun/track/trackers/mlflow_tracker.py +6 -5
  227. mlrun/utils/async_http.py +1 -1
  228. mlrun/utils/clones.py +1 -1
  229. mlrun/utils/helpers.py +63 -19
  230. mlrun/utils/logger.py +106 -4
  231. mlrun/utils/notifications/notification/__init__.py +22 -19
  232. mlrun/utils/notifications/notification/base.py +33 -14
  233. mlrun/utils/notifications/notification/console.py +6 -6
  234. mlrun/utils/notifications/notification/git.py +11 -11
  235. mlrun/utils/notifications/notification/ipython.py +10 -9
  236. mlrun/utils/notifications/notification/mail.py +176 -0
  237. mlrun/utils/notifications/notification/slack.py +6 -6
  238. mlrun/utils/notifications/notification/webhook.py +6 -6
  239. mlrun/utils/notifications/notification_pusher.py +86 -44
  240. mlrun/utils/regex.py +11 -2
  241. mlrun/utils/version/version.json +2 -2
  242. {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/METADATA +29 -24
  243. mlrun-1.8.0rc11.dist-info/RECORD +347 -0
  244. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  245. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  246. mlrun/model_monitoring/db/stores/sqldb/__init__.py +0 -13
  247. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  248. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  249. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  250. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  251. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  252. mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +0 -13
  253. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  254. mlrun/model_monitoring/model_endpoint.py +0 -118
  255. mlrun-1.7.1rc10.dist-info/RECORD +0 -351
  256. {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/LICENSE +0 -0
  257. {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/WHEEL +0 -0
  258. {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/entry_points.txt +0 -0
  259. {mlrun-1.7.1rc10.dist-info → mlrun-1.8.0rc11.dist-info}/top_level.txt +0 -0
@@ -396,7 +396,7 @@ class BaseStoreTarget(DataTargetBase):
396
396
  self,
397
397
  name: str = "",
398
398
  path=None,
399
- attributes: dict[str, str] = None,
399
+ attributes: Optional[dict[str, str]] = None,
400
400
  after_step=None,
401
401
  columns=None,
402
402
  partitioned: bool = False,
@@ -405,8 +405,8 @@ class BaseStoreTarget(DataTargetBase):
405
405
  time_partitioning_granularity: Optional[str] = None,
406
406
  max_events: Optional[int] = None,
407
407
  flush_after_seconds: Optional[int] = None,
408
- storage_options: dict[str, str] = None,
409
- schema: dict[str, Any] = None,
408
+ storage_options: Optional[dict[str, str]] = None,
409
+ schema: Optional[dict[str, Any]] = None,
410
410
  credentials_prefix=None,
411
411
  ):
412
412
  super().__init__(
@@ -834,16 +834,16 @@ class ParquetTarget(BaseStoreTarget):
834
834
  self,
835
835
  name: str = "",
836
836
  path=None,
837
- attributes: dict[str, str] = None,
837
+ attributes: Optional[dict[str, str]] = None,
838
838
  after_step=None,
839
839
  columns=None,
840
- partitioned: bool = None,
840
+ partitioned: Optional[bool] = None,
841
841
  key_bucketing_number: Optional[int] = None,
842
842
  partition_cols: Optional[list[str]] = None,
843
843
  time_partitioning_granularity: Optional[str] = None,
844
844
  max_events: Optional[int] = 10000,
845
845
  flush_after_seconds: Optional[int] = 900,
846
- storage_options: dict[str, str] = None,
846
+ storage_options: Optional[dict[str, str]] = None,
847
847
  ):
848
848
  self.path = path
849
849
  if partitioned is None:
@@ -1136,7 +1136,8 @@ class CSVTarget(BaseStoreTarget):
1136
1136
  import pyspark.sql.functions as funcs
1137
1137
 
1138
1138
  for col_name, col_type in df.dtypes:
1139
- if col_type == "timestamp":
1139
+ # covers TimestampType and TimestampNTZType, which was added in PySpark 3.4.0
1140
+ if col_type.startswith("timestamp"):
1140
1141
  # df.write.csv saves timestamps with millisecond precision, but we want microsecond precision
1141
1142
  # for compatibility with storey.
1142
1143
  df = df.withColumn(
@@ -1199,7 +1200,7 @@ class SnowflakeTarget(BaseStoreTarget):
1199
1200
  self,
1200
1201
  name: str = "",
1201
1202
  path=None,
1202
- attributes: dict[str, str] = None,
1203
+ attributes: Optional[dict[str, str]] = None,
1203
1204
  after_step=None,
1204
1205
  columns=None,
1205
1206
  partitioned: bool = False,
@@ -1208,15 +1209,15 @@ class SnowflakeTarget(BaseStoreTarget):
1208
1209
  time_partitioning_granularity: Optional[str] = None,
1209
1210
  max_events: Optional[int] = None,
1210
1211
  flush_after_seconds: Optional[int] = None,
1211
- storage_options: dict[str, str] = None,
1212
- schema: dict[str, Any] = None,
1212
+ storage_options: Optional[dict[str, str]] = None,
1213
+ schema: Optional[dict[str, Any]] = None,
1213
1214
  credentials_prefix=None,
1214
- url: str = None,
1215
- user: str = None,
1216
- db_schema: str = None,
1217
- database: str = None,
1218
- warehouse: str = None,
1219
- table_name: str = None,
1215
+ url: Optional[str] = None,
1216
+ user: Optional[str] = None,
1217
+ db_schema: Optional[str] = None,
1218
+ database: Optional[str] = None,
1219
+ warehouse: Optional[str] = None,
1220
+ table_name: Optional[str] = None,
1220
1221
  ):
1221
1222
  attributes = attributes or {}
1222
1223
  if url:
@@ -1903,7 +1904,7 @@ class SQLTarget(BaseStoreTarget):
1903
1904
  self,
1904
1905
  name: str = "",
1905
1906
  path=None,
1906
- attributes: dict[str, str] = None,
1907
+ attributes: Optional[dict[str, str]] = None,
1907
1908
  after_step=None,
1908
1909
  partitioned: bool = False,
1909
1910
  key_bucketing_number: Optional[int] = None,
@@ -1911,16 +1912,16 @@ class SQLTarget(BaseStoreTarget):
1911
1912
  time_partitioning_granularity: Optional[str] = None,
1912
1913
  max_events: Optional[int] = None,
1913
1914
  flush_after_seconds: Optional[int] = None,
1914
- storage_options: dict[str, str] = None,
1915
- db_url: str = None,
1916
- table_name: str = None,
1917
- schema: dict[str, Any] = None,
1915
+ storage_options: Optional[dict[str, str]] = None,
1916
+ db_url: Optional[str] = None,
1917
+ table_name: Optional[str] = None,
1918
+ schema: Optional[dict[str, Any]] = None,
1918
1919
  primary_key_column: str = "",
1919
1920
  if_exists: str = "append",
1920
1921
  create_table: bool = False,
1921
1922
  # create_according_to_data: bool = False,
1922
1923
  varchar_len: int = 50,
1923
- parse_dates: list[str] = None,
1924
+ parse_dates: Optional[list[str]] = None,
1924
1925
  ):
1925
1926
  """
1926
1927
  Write to SqlDB as output target for a flow.
mlrun/datastore/utils.py CHANGED
@@ -26,7 +26,7 @@ import mlrun.datastore
26
26
 
27
27
 
28
28
  def parse_kafka_url(
29
- url: str, brokers: typing.Union[list, str] = None
29
+ url: str, brokers: typing.Optional[typing.Union[list, str]] = None
30
30
  ) -> tuple[str, list]:
31
31
  """Generating Kafka topic and adjusting a list of bootstrap servers.
32
32
 
@@ -71,7 +71,7 @@ def upload_tarball(source_dir, target, secrets=None):
71
71
 
72
72
  def filter_df_start_end_time(
73
73
  df: typing.Union[pd.DataFrame, typing.Iterator[pd.DataFrame]],
74
- time_column: str = None,
74
+ time_column: typing.Optional[str] = None,
75
75
  start_time: pd.Timestamp = None,
76
76
  end_time: pd.Timestamp = None,
77
77
  ) -> typing.Union[pd.DataFrame, typing.Iterator[pd.DataFrame]]:
mlrun/datastore/v3io.py CHANGED
@@ -14,6 +14,7 @@
14
14
 
15
15
  import time
16
16
  from datetime import datetime
17
+ from typing import Optional
17
18
 
18
19
  import fsspec
19
20
  import v3io
@@ -33,7 +34,9 @@ V3IO_DEFAULT_UPLOAD_CHUNK_SIZE = 1024 * 1024 * 10
33
34
 
34
35
 
35
36
  class V3ioStore(DataStore):
36
- def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
37
+ def __init__(
38
+ self, parent, schema, name, endpoint="", secrets: Optional[dict] = None
39
+ ):
37
40
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
38
41
  self.endpoint = self.endpoint or mlrun.mlconf.v3io_api
39
42
 
@@ -0,0 +1,229 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from collections.abc import Iterable
17
+ from typing import Optional, Union
18
+
19
+ from mlrun.artifacts import DocumentArtifact
20
+
21
+
22
def _extract_collection_name(vectorstore: "VectorStore") -> str:  # noqa: F821
    """
    Work out the collection name of a vector store object.

    The attributes ``collection_name`` and ``_collection_name`` are tried in that
    order and the first truthy value is returned. If neither yields a name and the
    store's class is named ``mongodbatlasvectorsearch`` (compared case-insensitively),
    ``vectorstore.collection.name`` is returned instead.

    :param vectorstore: The vector store object to inspect.
    :return: The collection name found on the store.
    :raises ValueError: If no collection name could be determined.
    """
    for candidate in ("collection_name", "_collection_name"):
        # A missing attribute and an empty/None value are treated the same way:
        # move on to the next candidate.
        found = getattr(vectorstore, candidate, None)
        if found:
            return found

    class_name = vectorstore.__class__.__name__.lower()
    if class_name == "mongodbatlasvectorsearch":
        return vectorstore.collection.name

    # Nothing matched - the caller has to pass the name explicitly
    raise ValueError(
        "Failed to extract collection name from the vector store. "
        "Please provide the collection name explicitly. "
    )
41
+
42
+
43
class VectorStoreCollection:
    """
    A wrapper class for vector store collections with MLRun integration.

    This class wraps a vector store implementation (like Milvus, Chroma) and provides
    integration with MLRun context for document and artifact management. It delegates
    most operations to the underlying vector store while handling MLRun-specific
    functionality.

    The class implements attribute delegation through __getattr__ and __setattr__,
    allowing direct access to the underlying vector store's methods and attributes
    while maintaining MLRun integration.
    """

    def __init__(
        self,
        mlrun_context: Union["MlrunProject", "MLClientCtx"],  # noqa: F821
        vector_store: "VectorStore",  # noqa: F821
        collection_name: Optional[str] = None,
    ):
        """
        :param mlrun_context:   MLRun project or run context used to update artifacts.
                                When falsy, artifacts are not updated in MLRun.
        :param vector_store:    The vector store object to wrap.
        :param collection_name: Name of the collection. When not given, it is extracted
                                from ``vector_store``.
        """
        self._collection_impl = vector_store
        self._mlrun_context = mlrun_context
        # NOTE(review): "collection_name" is not one of the wrapper-private names
        # handled in __setattr__, so this assignment is forwarded to the wrapped
        # store and later reads go through __getattr__ - confirm this is intended.
        self.collection_name = collection_name or _extract_collection_name(vector_store)

    @property
    def __class__(self):
        # Make isinstance() check the wrapped object's class
        return self._collection_impl.__class__

    def __getattr__(self, name):
        # This method is called when an attribute is not found in the usual places.
        # The wrapper-private names must never be delegated: if they are missing
        # (e.g. an instance created without __init__), looking them up through
        # self._collection_impl would recurse until RecursionError.
        if name in ("_collection_impl", "_mlrun_context"):
            raise AttributeError(name)
        # Forward the attribute access to _collection_impl
        return getattr(self._collection_impl, name)

    def __setattr__(self, name, value):
        if name in ["_collection_impl", "_mlrun_context"] or name in self.__dict__:
            # Use the base class method to avoid recursion
            super().__setattr__(name, value)
        else:
            # Forward the attribute setting to _collection_impl
            setattr(self._collection_impl, name, value)

    def delete(self, *args, **kwargs):
        """
        Delete entries from the underlying vector store.

        All arguments are passed through unchanged.

        Returns:
            Whatever the underlying collection implementation's delete method returns.
        """
        return self._collection_impl.delete(*args, **kwargs)

    def add_documents(
        self,
        documents: list["Document"],  # noqa: F821
        **kwargs,
    ):
        """
        Add a list of documents to the collection.

        If the instance has an MLRun context, it will update the MLRun artifacts
        associated with the documents.

        Args:
            documents (list[Document]): A list of Document objects to be added.
            **kwargs: Additional keyword arguments to be passed to the underlying
                collection implementation.

        Returns:
            The result of the underlying collection implementation's add_documents method.
        """
        if self._mlrun_context:
            for document in documents:
                mlrun_uri = document.metadata.get(
                    DocumentArtifact.METADATA_ARTIFACT_URI_KEY
                )
                # Documents without an artifact URI in their metadata are added to
                # the store as-is, with no artifact bookkeeping.
                if mlrun_uri:
                    artifact = self._mlrun_context.get_store_resource(mlrun_uri)
                    artifact.collection_add(self.collection_name)
                    self._mlrun_context.update_artifact(artifact)

        return self._collection_impl.add_documents(documents, **kwargs)

    def add_artifacts(
        self, artifacts: list["DocumentArtifact"], splitter=None, **kwargs
    ):
        """
        Add a list of DocumentArtifact objects to the vector store collection.

        Converts artifacts to LangChain documents, adds them to the vector store, and
        updates the MLRun context. If documents are split, the IDs are handled appropriately.

        :param artifacts: List of DocumentArtifact objects to add
        :type artifacts: list[DocumentArtifact]
        :param splitter: Document splitter to break artifacts into smaller chunks.
                        If None, each artifact becomes a single document.
        :type splitter: TextSplitter, optional
        :param kwargs: Additional arguments passed to the underlying add_documents method.
                    Special handling for 'ids' kwarg:

                    * If provided and document is split, IDs are generated as "{original_id}_{i}"
                      where i starts from 1 (e.g., "doc1_1", "doc1_2", etc.)
                    * If provided and document isn't split, original IDs are used as-is

        :return: List of IDs for all added documents. When no custom IDs are provided:

                * Without splitting: Vector store generates IDs automatically
                * With splitting: Vector store generates separate IDs for each chunk

                When custom IDs are provided:

                * Without splitting: Uses provided IDs directly
                * With splitting: Generates sequential IDs as "{original_id}_{i}" for each chunk
        :rtype: list
        :raises ValueError: If 'ids' is not iterable or its length differs from the
                            number of artifacts.

        """
        all_ids = []
        user_ids = kwargs.pop("ids", None)

        if user_ids:
            if not isinstance(user_ids, Iterable):
                raise ValueError("IDs must be an iterable collection")
            # The IDs are measured with len() and indexed below. Iterables that
            # support neither (generators, sets, ...) are materialized once;
            # containers that already support both are used untouched.
            if not (hasattr(user_ids, "__len__") and hasattr(user_ids, "__getitem__")):
                user_ids = list(user_ids)
            if len(user_ids) != len(artifacts):
                raise ValueError(
                    "The number of IDs should match the number of artifacts"
                )
        for index, artifact in enumerate(artifacts):
            documents = artifact.to_langchain_documents(splitter)
            artifact.collection_add(self.collection_name)
            if self._mlrun_context:
                self._mlrun_context.update_artifact(artifact)
            if user_ids:
                num_of_documents = len(documents)
                if num_of_documents > 1:
                    # One ID per chunk, derived from the artifact's ID
                    ids_to_pass = [
                        f"{user_ids[index]}_{i}" for i in range(1, num_of_documents + 1)
                    ]
                else:
                    ids_to_pass = [user_ids[index]]
                kwargs["ids"] = ids_to_pass
            ids = self._collection_impl.add_documents(documents, **kwargs)
            all_ids.extend(ids)
        return all_ids

    def remove_from_artifact(self, artifact: "DocumentArtifact"):
        """
        Remove the current object from the given artifact's collection and update the artifact.

        Args:
            artifact (DocumentArtifact): The artifact from which the current object should be removed.
        """
        artifact.collection_remove(self.collection_name)
        if self._mlrun_context:
            self._mlrun_context.update_artifact(artifact)

    def delete_artifacts(self, artifacts: list["DocumentArtifact"]):
        """
        Delete a list of DocumentArtifact objects from the collection.

        This method removes the specified artifacts from the collection and updates the MLRun context.
        The deletion process varies depending on the type of the underlying collection implementation.
        Every artifact in the list is processed.

        Args:
            artifacts (list[DocumentArtifact]): A list of DocumentArtifact objects to be deleted.

        Returns:
            The value returned by the underlying delete call for the last artifact
            processed, or None when ``artifacts`` is empty. For a single artifact this
            is the underlying delete call's own result.

        Raises:
            NotImplementedError: If the delete operation is not supported for the collection implementation.
        """
        store_class = self._collection_impl.__class__.__name__.lower()
        result = None
        for artifact in artifacts:
            artifact.collection_remove(self.collection_name)
            if self._mlrun_context:
                self._mlrun_context.update_artifact(artifact)

            # Do not return from inside the loop: returning here would leave the
            # remaining artifacts' documents in the store.
            if store_class == "milvus":
                expr = f"{DocumentArtifact.METADATA_SOURCE_KEY} == '{artifact.get_source()}'"
                result = self._collection_impl.delete(expr=expr)
            elif store_class == "chroma":
                where = {DocumentArtifact.METADATA_SOURCE_KEY: artifact.get_source()}
                result = self._collection_impl.delete(where=where)
            elif (
                hasattr(self._collection_impl, "delete")
                and "filter"
                in inspect.signature(self._collection_impl.delete).parameters
            ):
                # Generic path: any store whose delete() accepts a "filter" argument
                metadata_filter = {
                    "metadata": {
                        DocumentArtifact.METADATA_SOURCE_KEY: artifact.get_source()
                    }
                }
                result = self._collection_impl.delete(filter=metadata_filter)
            else:
                raise NotImplementedError(
                    f"delete_artifacts() operation not supported for {store_class}"
                )
        return result
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from typing import Optional
15
16
  from urllib.parse import urlparse
16
17
 
17
18
  from fsspec import AbstractFileSystem
@@ -22,23 +23,23 @@ class WasbFS(AbstractFileSystem):
22
23
 
23
24
  def __init__(
24
25
  self,
25
- account_name: str = None,
26
- account_key: str = None,
27
- connection_string: str = None,
28
- credential: str = None,
29
- sas_token: str = None,
26
+ account_name: Optional[str] = None,
27
+ account_key: Optional[str] = None,
28
+ connection_string: Optional[str] = None,
29
+ credential: Optional[str] = None,
30
+ sas_token: Optional[str] = None,
30
31
  request_session=None,
31
- socket_timeout: int = None,
32
- blocksize: int = None,
33
- client_id: str = None,
34
- client_secret: str = None,
35
- tenant_id: str = None,
32
+ socket_timeout: Optional[int] = None,
33
+ blocksize: Optional[int] = None,
34
+ client_id: Optional[str] = None,
35
+ client_secret: Optional[str] = None,
36
+ tenant_id: Optional[str] = None,
36
37
  anon: bool = True,
37
- location_mode: str = None,
38
+ location_mode: Optional[str] = None,
38
39
  loop=None,
39
40
  asynchronous: bool = False,
40
41
  default_fill_cache: bool = True,
41
- default_cache_type: str = None,
42
+ default_cache_type: Optional[str] = None,
42
43
  **kwargs,
43
44
  ):
44
45
  from adlfs import AzureBlobFileSystem