mlrun 1.7.2rc4__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic. Click here for more details.

Files changed (275) hide show
  1. mlrun/__init__.py +26 -22
  2. mlrun/__main__.py +15 -16
  3. mlrun/alerts/alert.py +150 -15
  4. mlrun/api/schemas/__init__.py +1 -9
  5. mlrun/artifacts/__init__.py +2 -3
  6. mlrun/artifacts/base.py +62 -19
  7. mlrun/artifacts/dataset.py +17 -17
  8. mlrun/artifacts/document.py +454 -0
  9. mlrun/artifacts/manager.py +28 -18
  10. mlrun/artifacts/model.py +91 -59
  11. mlrun/artifacts/plots.py +2 -2
  12. mlrun/common/constants.py +8 -0
  13. mlrun/common/formatters/__init__.py +1 -0
  14. mlrun/common/formatters/artifact.py +1 -1
  15. mlrun/common/formatters/feature_set.py +2 -0
  16. mlrun/common/formatters/function.py +1 -0
  17. mlrun/{model_monitoring/db/stores/v3io_kv/__init__.py → common/formatters/model_endpoint.py} +17 -0
  18. mlrun/common/formatters/pipeline.py +1 -2
  19. mlrun/common/formatters/project.py +9 -0
  20. mlrun/common/model_monitoring/__init__.py +0 -5
  21. mlrun/common/model_monitoring/helpers.py +12 -62
  22. mlrun/common/runtimes/constants.py +25 -4
  23. mlrun/common/schemas/__init__.py +9 -5
  24. mlrun/common/schemas/alert.py +114 -19
  25. mlrun/common/schemas/api_gateway.py +3 -3
  26. mlrun/common/schemas/artifact.py +22 -9
  27. mlrun/common/schemas/auth.py +8 -4
  28. mlrun/common/schemas/background_task.py +7 -7
  29. mlrun/common/schemas/client_spec.py +4 -4
  30. mlrun/common/schemas/clusterization_spec.py +2 -2
  31. mlrun/common/schemas/common.py +53 -3
  32. mlrun/common/schemas/constants.py +15 -0
  33. mlrun/common/schemas/datastore_profile.py +1 -1
  34. mlrun/common/schemas/feature_store.py +9 -9
  35. mlrun/common/schemas/frontend_spec.py +4 -4
  36. mlrun/common/schemas/function.py +10 -10
  37. mlrun/common/schemas/hub.py +1 -1
  38. mlrun/common/schemas/k8s.py +3 -3
  39. mlrun/common/schemas/memory_reports.py +3 -3
  40. mlrun/common/schemas/model_monitoring/__init__.py +4 -8
  41. mlrun/common/schemas/model_monitoring/constants.py +127 -46
  42. mlrun/common/schemas/model_monitoring/grafana.py +18 -12
  43. mlrun/common/schemas/model_monitoring/model_endpoints.py +154 -160
  44. mlrun/common/schemas/notification.py +24 -3
  45. mlrun/common/schemas/object.py +1 -1
  46. mlrun/common/schemas/pagination.py +4 -4
  47. mlrun/common/schemas/partition.py +142 -0
  48. mlrun/common/schemas/pipeline.py +3 -3
  49. mlrun/common/schemas/project.py +26 -18
  50. mlrun/common/schemas/runs.py +3 -3
  51. mlrun/common/schemas/runtime_resource.py +5 -5
  52. mlrun/common/schemas/schedule.py +1 -1
  53. mlrun/common/schemas/secret.py +1 -1
  54. mlrun/{model_monitoring/db/stores/sqldb/__init__.py → common/schemas/serving.py} +10 -1
  55. mlrun/common/schemas/tag.py +3 -3
  56. mlrun/common/schemas/workflow.py +6 -5
  57. mlrun/common/types.py +1 -0
  58. mlrun/config.py +157 -89
  59. mlrun/data_types/__init__.py +5 -3
  60. mlrun/data_types/infer.py +13 -3
  61. mlrun/data_types/spark.py +2 -1
  62. mlrun/datastore/__init__.py +59 -18
  63. mlrun/datastore/alibaba_oss.py +4 -1
  64. mlrun/datastore/azure_blob.py +4 -1
  65. mlrun/datastore/base.py +19 -24
  66. mlrun/datastore/datastore.py +10 -4
  67. mlrun/datastore/datastore_profile.py +178 -45
  68. mlrun/datastore/dbfs_store.py +4 -1
  69. mlrun/datastore/filestore.py +4 -1
  70. mlrun/datastore/google_cloud_storage.py +4 -1
  71. mlrun/datastore/hdfs.py +4 -1
  72. mlrun/datastore/inmem.py +4 -1
  73. mlrun/datastore/redis.py +4 -1
  74. mlrun/datastore/s3.py +14 -3
  75. mlrun/datastore/sources.py +89 -92
  76. mlrun/datastore/store_resources.py +7 -4
  77. mlrun/datastore/storeytargets.py +51 -16
  78. mlrun/datastore/targets.py +38 -31
  79. mlrun/datastore/utils.py +87 -4
  80. mlrun/datastore/v3io.py +4 -1
  81. mlrun/datastore/vectorstore.py +291 -0
  82. mlrun/datastore/wasbfs/fs.py +13 -12
  83. mlrun/db/base.py +286 -100
  84. mlrun/db/httpdb.py +1562 -490
  85. mlrun/db/nopdb.py +250 -83
  86. mlrun/errors.py +6 -2
  87. mlrun/execution.py +194 -50
  88. mlrun/feature_store/__init__.py +2 -10
  89. mlrun/feature_store/api.py +20 -458
  90. mlrun/feature_store/common.py +9 -9
  91. mlrun/feature_store/feature_set.py +20 -18
  92. mlrun/feature_store/feature_vector.py +105 -479
  93. mlrun/feature_store/feature_vector_utils.py +466 -0
  94. mlrun/feature_store/retrieval/base.py +15 -11
  95. mlrun/feature_store/retrieval/job.py +2 -1
  96. mlrun/feature_store/retrieval/storey_merger.py +1 -1
  97. mlrun/feature_store/steps.py +3 -3
  98. mlrun/features.py +30 -13
  99. mlrun/frameworks/__init__.py +1 -2
  100. mlrun/frameworks/_common/__init__.py +1 -2
  101. mlrun/frameworks/_common/artifacts_library.py +2 -2
  102. mlrun/frameworks/_common/mlrun_interface.py +10 -6
  103. mlrun/frameworks/_common/model_handler.py +31 -31
  104. mlrun/frameworks/_common/producer.py +3 -1
  105. mlrun/frameworks/_dl_common/__init__.py +1 -2
  106. mlrun/frameworks/_dl_common/loggers/__init__.py +1 -2
  107. mlrun/frameworks/_dl_common/loggers/mlrun_logger.py +4 -4
  108. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +3 -3
  109. mlrun/frameworks/_ml_common/__init__.py +1 -2
  110. mlrun/frameworks/_ml_common/loggers/__init__.py +1 -2
  111. mlrun/frameworks/_ml_common/model_handler.py +21 -21
  112. mlrun/frameworks/_ml_common/plans/__init__.py +1 -2
  113. mlrun/frameworks/_ml_common/plans/confusion_matrix_plan.py +3 -1
  114. mlrun/frameworks/_ml_common/plans/dataset_plan.py +3 -3
  115. mlrun/frameworks/_ml_common/plans/roc_curve_plan.py +4 -4
  116. mlrun/frameworks/auto_mlrun/__init__.py +1 -2
  117. mlrun/frameworks/auto_mlrun/auto_mlrun.py +22 -15
  118. mlrun/frameworks/huggingface/__init__.py +1 -2
  119. mlrun/frameworks/huggingface/model_server.py +9 -9
  120. mlrun/frameworks/lgbm/__init__.py +47 -44
  121. mlrun/frameworks/lgbm/callbacks/__init__.py +1 -2
  122. mlrun/frameworks/lgbm/callbacks/logging_callback.py +4 -2
  123. mlrun/frameworks/lgbm/callbacks/mlrun_logging_callback.py +4 -2
  124. mlrun/frameworks/lgbm/mlrun_interfaces/__init__.py +1 -2
  125. mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +5 -5
  126. mlrun/frameworks/lgbm/model_handler.py +15 -11
  127. mlrun/frameworks/lgbm/model_server.py +11 -7
  128. mlrun/frameworks/lgbm/utils.py +2 -2
  129. mlrun/frameworks/onnx/__init__.py +1 -2
  130. mlrun/frameworks/onnx/dataset.py +3 -3
  131. mlrun/frameworks/onnx/mlrun_interface.py +2 -2
  132. mlrun/frameworks/onnx/model_handler.py +7 -5
  133. mlrun/frameworks/onnx/model_server.py +8 -6
  134. mlrun/frameworks/parallel_coordinates.py +11 -11
  135. mlrun/frameworks/pytorch/__init__.py +22 -23
  136. mlrun/frameworks/pytorch/callbacks/__init__.py +1 -2
  137. mlrun/frameworks/pytorch/callbacks/callback.py +2 -1
  138. mlrun/frameworks/pytorch/callbacks/logging_callback.py +15 -8
  139. mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +19 -12
  140. mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +22 -15
  141. mlrun/frameworks/pytorch/callbacks_handler.py +36 -30
  142. mlrun/frameworks/pytorch/mlrun_interface.py +17 -17
  143. mlrun/frameworks/pytorch/model_handler.py +21 -17
  144. mlrun/frameworks/pytorch/model_server.py +13 -9
  145. mlrun/frameworks/sklearn/__init__.py +19 -18
  146. mlrun/frameworks/sklearn/estimator.py +2 -2
  147. mlrun/frameworks/sklearn/metric.py +3 -3
  148. mlrun/frameworks/sklearn/metrics_library.py +8 -6
  149. mlrun/frameworks/sklearn/mlrun_interface.py +3 -2
  150. mlrun/frameworks/sklearn/model_handler.py +4 -3
  151. mlrun/frameworks/tf_keras/__init__.py +11 -12
  152. mlrun/frameworks/tf_keras/callbacks/__init__.py +1 -2
  153. mlrun/frameworks/tf_keras/callbacks/logging_callback.py +17 -14
  154. mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +15 -12
  155. mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +21 -18
  156. mlrun/frameworks/tf_keras/model_handler.py +17 -13
  157. mlrun/frameworks/tf_keras/model_server.py +12 -8
  158. mlrun/frameworks/xgboost/__init__.py +19 -18
  159. mlrun/frameworks/xgboost/model_handler.py +13 -9
  160. mlrun/k8s_utils.py +2 -5
  161. mlrun/launcher/base.py +3 -4
  162. mlrun/launcher/client.py +2 -2
  163. mlrun/launcher/local.py +6 -2
  164. mlrun/launcher/remote.py +1 -1
  165. mlrun/lists.py +8 -4
  166. mlrun/model.py +132 -46
  167. mlrun/model_monitoring/__init__.py +3 -5
  168. mlrun/model_monitoring/api.py +113 -98
  169. mlrun/model_monitoring/applications/__init__.py +0 -5
  170. mlrun/model_monitoring/applications/_application_steps.py +81 -50
  171. mlrun/model_monitoring/applications/base.py +467 -14
  172. mlrun/model_monitoring/applications/context.py +212 -134
  173. mlrun/model_monitoring/{db/stores/base → applications/evidently}/__init__.py +6 -2
  174. mlrun/model_monitoring/applications/evidently/base.py +146 -0
  175. mlrun/model_monitoring/applications/histogram_data_drift.py +89 -56
  176. mlrun/model_monitoring/applications/results.py +67 -15
  177. mlrun/model_monitoring/controller.py +701 -315
  178. mlrun/model_monitoring/db/__init__.py +0 -2
  179. mlrun/model_monitoring/db/_schedules.py +242 -0
  180. mlrun/model_monitoring/db/_stats.py +189 -0
  181. mlrun/model_monitoring/db/tsdb/__init__.py +33 -22
  182. mlrun/model_monitoring/db/tsdb/base.py +243 -49
  183. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +76 -36
  184. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +33 -0
  185. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +213 -0
  186. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +534 -88
  187. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +1 -0
  188. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +436 -106
  189. mlrun/model_monitoring/helpers.py +356 -114
  190. mlrun/model_monitoring/stream_processing.py +190 -345
  191. mlrun/model_monitoring/tracking_policy.py +11 -4
  192. mlrun/model_monitoring/writer.py +49 -90
  193. mlrun/package/__init__.py +3 -6
  194. mlrun/package/context_handler.py +2 -2
  195. mlrun/package/packager.py +12 -9
  196. mlrun/package/packagers/__init__.py +0 -2
  197. mlrun/package/packagers/default_packager.py +14 -11
  198. mlrun/package/packagers/numpy_packagers.py +16 -7
  199. mlrun/package/packagers/pandas_packagers.py +18 -18
  200. mlrun/package/packagers/python_standard_library_packagers.py +25 -11
  201. mlrun/package/packagers_manager.py +35 -32
  202. mlrun/package/utils/__init__.py +0 -3
  203. mlrun/package/utils/_pickler.py +6 -6
  204. mlrun/platforms/__init__.py +47 -16
  205. mlrun/platforms/iguazio.py +4 -1
  206. mlrun/projects/operations.py +30 -30
  207. mlrun/projects/pipelines.py +116 -47
  208. mlrun/projects/project.py +1292 -329
  209. mlrun/render.py +5 -9
  210. mlrun/run.py +57 -14
  211. mlrun/runtimes/__init__.py +1 -3
  212. mlrun/runtimes/base.py +30 -22
  213. mlrun/runtimes/daskjob.py +9 -9
  214. mlrun/runtimes/databricks_job/databricks_runtime.py +6 -5
  215. mlrun/runtimes/function_reference.py +5 -2
  216. mlrun/runtimes/generators.py +3 -2
  217. mlrun/runtimes/kubejob.py +6 -7
  218. mlrun/runtimes/mounts.py +574 -0
  219. mlrun/runtimes/mpijob/__init__.py +0 -2
  220. mlrun/runtimes/mpijob/abstract.py +7 -6
  221. mlrun/runtimes/nuclio/api_gateway.py +7 -7
  222. mlrun/runtimes/nuclio/application/application.py +11 -13
  223. mlrun/runtimes/nuclio/application/reverse_proxy.go +66 -64
  224. mlrun/runtimes/nuclio/function.py +127 -70
  225. mlrun/runtimes/nuclio/serving.py +105 -37
  226. mlrun/runtimes/pod.py +159 -54
  227. mlrun/runtimes/remotesparkjob.py +3 -2
  228. mlrun/runtimes/sparkjob/__init__.py +0 -2
  229. mlrun/runtimes/sparkjob/spark3job.py +22 -12
  230. mlrun/runtimes/utils.py +7 -6
  231. mlrun/secrets.py +2 -2
  232. mlrun/serving/__init__.py +8 -0
  233. mlrun/serving/merger.py +7 -5
  234. mlrun/serving/remote.py +35 -22
  235. mlrun/serving/routers.py +186 -240
  236. mlrun/serving/server.py +41 -10
  237. mlrun/serving/states.py +432 -118
  238. mlrun/serving/utils.py +13 -2
  239. mlrun/serving/v1_serving.py +3 -2
  240. mlrun/serving/v2_serving.py +161 -203
  241. mlrun/track/__init__.py +1 -1
  242. mlrun/track/tracker.py +2 -2
  243. mlrun/track/trackers/mlflow_tracker.py +6 -5
  244. mlrun/utils/async_http.py +35 -22
  245. mlrun/utils/clones.py +7 -4
  246. mlrun/utils/helpers.py +511 -58
  247. mlrun/utils/logger.py +119 -13
  248. mlrun/utils/notifications/notification/__init__.py +22 -19
  249. mlrun/utils/notifications/notification/base.py +39 -15
  250. mlrun/utils/notifications/notification/console.py +6 -6
  251. mlrun/utils/notifications/notification/git.py +11 -11
  252. mlrun/utils/notifications/notification/ipython.py +10 -9
  253. mlrun/utils/notifications/notification/mail.py +176 -0
  254. mlrun/utils/notifications/notification/slack.py +16 -8
  255. mlrun/utils/notifications/notification/webhook.py +24 -8
  256. mlrun/utils/notifications/notification_pusher.py +191 -200
  257. mlrun/utils/regex.py +12 -2
  258. mlrun/utils/version/version.json +2 -2
  259. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/METADATA +69 -54
  260. mlrun-1.8.0.dist-info/RECORD +351 -0
  261. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/WHEEL +1 -1
  262. mlrun/model_monitoring/applications/evidently_base.py +0 -137
  263. mlrun/model_monitoring/db/stores/__init__.py +0 -136
  264. mlrun/model_monitoring/db/stores/base/store.py +0 -213
  265. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +0 -71
  266. mlrun/model_monitoring/db/stores/sqldb/models/base.py +0 -190
  267. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +0 -103
  268. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +0 -40
  269. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +0 -659
  270. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +0 -726
  271. mlrun/model_monitoring/model_endpoint.py +0 -118
  272. mlrun-1.7.2rc4.dist-info/RECORD +0 -351
  273. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/entry_points.txt +0 -0
  274. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info/licenses}/LICENSE +0 -0
  275. {mlrun-1.7.2rc4.dist-info → mlrun-1.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,454 @@
1
+ # Copyright 2024 Iguazio
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import re
16
+ import tempfile
17
+ from collections.abc import Iterator
18
+ from copy import deepcopy
19
+ from importlib import import_module
20
+ from typing import Optional, Union
21
+
22
+ import mlrun
23
+ import mlrun.artifacts
24
+ from mlrun.artifacts import Artifact, ArtifactSpec
25
+ from mlrun.model import ModelObj
26
+
27
+ from ..utils import generate_artifact_uri
28
+ from .base import ArtifactStatus
29
+
30
+
31
class DocumentLoaderSpec(ModelObj):
    """
    A serializable description of how to load a document with a loader class.

    The loader class is identified by its fully qualified dotted name, imported dynamically and
    instantiated with the stored keyword arguments plus the source path. The loaded documents
    can be optionally uploaded as artifacts. Note that only loader classes that return single results
    (e.g., TextLoader, UnstructuredHTMLLoader, WebBaseLoader(scalar)) are supported - loaders returning
    multiple results like DirectoryLoader or WebBaseLoader(list) are not compatible.

    Attributes:
        loader_class_name (str): Fully qualified name (``package.module.ClassName``) of the loader class.
        src_name (str): Name of the loader-constructor keyword argument that receives the source path.
        download_object (bool): If True, the file will be downloaded before launching the loader.
            If False, the loader accepts a link that should not be downloaded. The flag is only
            stored on the spec; ``make_loader`` itself does not consult it.
        kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.

    """

    _dict_fields = ["loader_class_name", "src_name", "download_object", "kwargs"]

    def __init__(
        self,
        loader_class_name: str = "langchain_community.document_loaders.TextLoader",
        src_name: str = "file_path",
        download_object: bool = True,
        kwargs: Optional[dict] = None,
    ):
        """
        Initialize the document loader specification.

        Args:
            loader_class_name (str): Fully qualified name of the loader class to use.
            src_name (str): Name of the loader keyword argument that receives the source path.
            download_object (bool, optional): If True, the file will be downloaded before launching
                the loader. If False, the loader accepts a link that should not be downloaded.
                Defaults to True.
            kwargs (Optional[dict]): Additional keyword arguments to pass to the loader class.

        Example:
            >>> # Create a loader specification for PDF documents
            >>> loader_spec = DocumentLoaderSpec(
            ...     loader_class_name="langchain_community.document_loaders.PyPDFLoader",
            ...     src_name="file_path",
            ...     kwargs={"extract_images": True},
            ... )
            >>> # Create a loader instance for a specific PDF file
            >>> pdf_loader = loader_spec.make_loader("/path/to/document.pdf")
            >>> # Load the documents
            >>> documents = pdf_loader.load()

        """
        self.loader_class_name = loader_class_name
        self.src_name = src_name
        self.download_object = download_object
        self.kwargs = kwargs

    def make_loader(self, src_path):
        """
        Import the configured loader class and instantiate it for ``src_path``.

        Args:
            src_path: The value passed to the loader under the ``src_name`` keyword argument.

        Returns:
            An instance of the class named by ``loader_class_name``.

        Raises:
            ValueError: If ``loader_class_name`` has no module part (contains no ``.``).
        """
        module_name, separator, class_name = self.loader_class_name.rpartition(".")
        if not separator:
            # Without a module part there is nothing to import; fail with an explicit message
            # instead of an opaque tuple-unpacking error.
            raise ValueError(
                f"loader_class_name must be a fully qualified 'module.ClassName' path, "
                f"got: {self.loader_class_name!r}"
            )
        loader_class = getattr(import_module(module_name), class_name)
        # Deep-copy so that the loader can never mutate the kwargs stored on the spec.
        loader_kwargs = deepcopy(self.kwargs or {})
        loader_kwargs[self.src_name] = src_path
        return loader_class(**loader_kwargs)
93
+
94
+
95
class MLRunLoader:
    """
    A factory class for creating instances of a dynamically defined document loader.

    Calling ``MLRunLoader(...)`` does not return an ``MLRunLoader``; it returns an instance of a
    ``BaseLoader`` subclass defined on the fly, whose ``lazy_load`` logs the source file as a
    document artifact on the producer and returns the artifact's LangChain documents.

    Args:
        source_path (str): The source path of the document to be loaded.
        loader_spec (DocumentLoaderSpec): Specification for the document loader.
        artifact_key (str, optional): The key for the artifact to be logged.
            The '%%' pattern in the key will be replaced by the source path
            with any unsupported characters converted to '_'. Defaults to "%%".
        producer (Optional[Union[MlrunProject, str, MLClientCtx]], optional): The producer of the document.
            When falsy, ``mlrun.mlconf.default_project`` is used. A string value is resolved
            through ``mlrun.get_or_create_project``. Defaults to None.
        upload (bool, optional): Flag indicating whether to upload the document. Defaults to False.
        tag (str, optional): Version tag for the artifact. Defaults to "".
        labels (Optional[Dict[str, str]], optional): Key-value labels to attach to the artifact.
            Defaults to None.

    Returns:
        DynamicDocumentLoader: An instance of a dynamically defined subclass of BaseLoader.

    Example:
        >>> # Create a document loader specification
        >>> loader_spec = DocumentLoaderSpec(
        ...     loader_class_name="langchain_community.document_loaders.TextLoader",
        ...     src_name="file_path",
        ... )
        >>> # Create a basic loader for a single file
        >>> loader = MLRunLoader(
        ...     source_path="/path/to/document.txt",
        ...     loader_spec=loader_spec,
        ...     artifact_key="my_doc",
        ...     producer=project,
        ...     upload=True,
        ... )
        >>> documents = loader.load()
        >>> # Create a loader with auto-generated keys
        >>> loader = MLRunLoader(
        ...     source_path="/path/to/document.txt",
        ...     loader_spec=loader_spec,
        ...     artifact_key="%%",  # %% will be replaced with encoded path
        ...     producer=project,
        ... )
        >>> documents = loader.load()
        >>> # Use with DirectoryLoader
        >>> from langchain_community.document_loaders import DirectoryLoader
        >>> dir_loader = DirectoryLoader(
        ...     "/path/to/directory",
        ...     glob="**/*.txt",
        ...     loader_cls=MLRunLoader,
        ...     loader_kwargs={
        ...         "loader_spec": loader_spec,
        ...         "artifact_key": "%%",
        ...         "producer": project,
        ...         "upload": True,
        ...     },
        ... )
        >>> documents = dir_loader.load()

    """

    def __new__(
        cls,
        source_path: str,
        loader_spec: "DocumentLoaderSpec",
        artifact_key="%%",
        producer: Optional[Union["MlrunProject", str, "MLClientCtx"]] = None,  # noqa: F821
        upload: bool = False,
        tag: str = "",
        labels: Optional[dict[str, str]] = None,
    ):
        # BaseLoader is imported here rather than at module level, so the LangChain package
        # is only required once a loader is actually created.
        from langchain_community.document_loaders.base import BaseLoader

        class DynamicDocumentLoader(BaseLoader):
            def __init__(
                self,
                local_path,
                loader_spec,
                artifact_key,
                producer,
                upload,
                tag,
                labels,
            ):
                # Resolve the producer: fall back to the configured default project,
                # and turn a project name into a project object.
                resolved_producer = producer or mlrun.mlconf.default_project
                if isinstance(resolved_producer, str):
                    resolved_producer = mlrun.get_or_create_project(resolved_producer)
                self.producer = resolved_producer

                # artifact_key_instance returns keys without '%%' untouched.
                self.artifact_key = MLRunLoader.artifact_key_instance(
                    artifact_key, local_path
                )
                self.loader_spec = loader_spec
                self.local_path = local_path
                self.upload = upload
                self.tag = tag
                self.labels = labels

            def lazy_load(self) -> Iterator["Document"]:  # noqa: F821
                # Carry over the collections recorded on a previously logged artifact
                # with the same key/tag, if there is one.
                previous_collections = None
                try:
                    existing = self.producer.get_artifact(self.artifact_key, self.tag)
                    if existing:
                        previous_collections = existing.status.collections
                except mlrun.MLRunNotFoundError:
                    pass

                logged_artifact = self.producer.log_document(
                    key=self.artifact_key,
                    tag=self.tag,
                    local_path=self.local_path,
                    document_loader_spec=self.loader_spec,
                    upload=self.upload,
                    labels=self.labels,
                    collections=previous_collections,
                )
                return logged_artifact.to_langchain_documents()

        # Return an instance of the dynamically defined subclass
        return DynamicDocumentLoader(
            local_path=source_path,
            loader_spec=loader_spec,
            artifact_key=artifact_key,
            producer=producer,
            upload=upload,
            tag=tag,
            labels=labels,
        )

    @staticmethod
    def artifact_key_instance(artifact_key: str, src_path: str) -> str:
        """Return ``artifact_key`` with every '%%' replaced by the key derived from ``src_path``."""
        if "%%" not in artifact_key:
            return artifact_key
        return artifact_key.replace("%%", DocumentArtifact.key_from_source(src_path))
236
+
237
+
238
class DocumentArtifact(Artifact):
    """
    A specific artifact class inheriting from generic artifact, used to maintain Document meta-data.
    """

    @staticmethod
    def key_from_source(src_path: str) -> str:
        """Convert a source path into a valid artifact key by replacing invalid characters with underscores.

        Valid character sequences are found with the first pattern of ``mlrun.utils.regex.artifact_key``,
        so the exact output depends on that pattern.

        Args:
            src_path (str): The source path to be converted into a valid artifact key
        Returns:
            str: A modified version of the source path where all invalid characters are replaced
                with underscores while preserving valid sequences in their original positions.
                Leading underscores are stripped from the result.
        Examples:
            >>> DocumentArtifact.key_from_source("data/file-name(v1).txt")
            "data_file-name_v1__txt"
        """
        pattern = mlrun.utils.regex.artifact_key[0]
        # Convert anchored pattern (^...$) to non-anchored version for finditer
        search_pattern = pattern.strip("^$")
        pieces = []
        cursor = 0

        for match in re.finditer(search_pattern, src_path):
            # One '_' for every character skipped between two valid sequences
            pieces.append("_" * (match.start() - cursor))
            pieces.append(match.group())
            cursor = match.end()

        # One '_' for every character left after the last valid sequence
        pieces.append("_" * (len(src_path) - cursor))

        return "".join(pieces).lstrip("_")

    class DocumentArtifactSpec(ArtifactSpec):
        """Artifact spec extended with the document loader description and the original source."""

        _dict_fields = ArtifactSpec._dict_fields + [
            "document_loader",
            "original_source",
        ]

        def __init__(
            self,
            *args,
            document_loader: Optional[DocumentLoaderSpec] = None,
            original_source: Optional[str] = None,
            **kwargs,
        ):
            super().__init__(*args, **kwargs)
            self.document_loader = document_loader
            self.original_source = original_source

    class DocumentArtifactStatus(ArtifactStatus):
        """Artifact status extended with the mapping of collections the document belongs to."""

        _dict_fields = ArtifactStatus._dict_fields + ["collections"]

        def __init__(
            self,
            *args,
            collections: Optional[dict] = None,
            **kwargs,
        ):
            super().__init__(*args, **kwargs)
            # Always hold a dict, so membership tests and updates never hit None
            self.collections = collections if collections is not None else {}

    kind = "document"

    # Metadata keys written on every LangChain document produced by to_langchain_documents()
    METADATA_SOURCE_KEY = "source"
    METADATA_ORIGINAL_SOURCE_KEY = "original_source"
    METADATA_CHUNK_KEY = "mlrun_chunk"
    METADATA_ARTIFACT_TARGET_PATH_KEY = "mlrun_target_path"
    METADATA_ARTIFACT_TAG = "mlrun_tag"
    METADATA_ARTIFACT_KEY = "mlrun_key"
    METADATA_ARTIFACT_PROJECT = "mlrun_project"

    def __init__(
        self,
        original_source: Optional[str] = None,
        document_loader_spec: Optional[DocumentLoaderSpec] = None,
        collections: Optional[dict] = None,
        **kwargs,
    ):
        """
        Create a document artifact.

        Args:
            original_source (Optional[str]): The original source of the document. When falsy,
                the value already present on the spec is kept.
            document_loader_spec (Optional[DocumentLoaderSpec]): The loader specification; stored
                on the spec in its dict form. When falsy, the value already on the spec is kept.
            collections (Optional[dict]): Initial collections mapping for the artifact status.
            **kwargs: Passed to the base ``Artifact`` constructor.
        """
        super().__init__(**kwargs)
        self.spec.document_loader = (
            document_loader_spec.to_dict()
            if document_loader_spec
            else self.spec.document_loader
        )
        self.spec.original_source = original_source or self.spec.original_source
        self.status = DocumentArtifact.DocumentArtifactStatus(collections=collections)

    @property
    def status(self) -> DocumentArtifactStatus:
        return self._status

    @status.setter
    def status(self, status):
        self._status = self._verify_dict(
            status, "status", DocumentArtifact.DocumentArtifactStatus
        )

    @property
    def spec(self) -> DocumentArtifactSpec:
        return self._spec

    @spec.setter
    def spec(self, spec):
        self._spec = self._verify_dict(
            spec, "spec", DocumentArtifact.DocumentArtifactSpec
        )

    def get_source(self):
        """Get the source URI for this artifact."""
        return generate_artifact_uri(self.metadata.project, self.spec.db_key)

    def to_langchain_documents(
        self,
        splitter: Optional["TextSplitter"] = None,  # noqa: F821
    ) -> list["Document"]:  # noqa: F821
        """
        Create LC documents from the artifact

        Args:
            splitter (Optional[TextSplitter]): A LangChain TextSplitter to split the document into chunks.

        Returns:
            list[Document]: A list of LangChain Document objects.

        Raises:
            ValueError: If the artifact has neither a downloadable target path nor an original source.
        """
        from langchain.schema import Document

        loader_spec = DocumentLoaderSpec.from_dict(self.spec.document_loader)
        target_path = self.get_target_path()
        if loader_spec.download_object and target_path:
            # The loader must run while the temporary file still exists
            with tempfile.NamedTemporaryFile() as tmp_file:
                mlrun.datastore.store_manager.object(url=target_path).download(
                    tmp_file.name
                )
                loader = loader_spec.make_loader(tmp_file.name)
                documents = loader.load()
        elif self.spec.original_source:
            loader = loader_spec.make_loader(self.spec.original_source)
            documents = loader.load()
        else:
            raise ValueError(
                "No src_path or target_path provided. Cannot load document."
            )

        results = []
        # The chunk index runs across all documents, not per document
        idx = 0
        for document in documents:
            if splitter:
                texts = splitter.split_text(document.page_content)
            else:
                texts = [document.page_content]

            # Work on a copy so the loader's own document objects are left untouched
            metadata = dict(document.metadata)
            metadata[self.METADATA_ORIGINAL_SOURCE_KEY] = self.spec.original_source
            metadata[self.METADATA_SOURCE_KEY] = self.get_source()
            metadata[self.METADATA_ARTIFACT_TAG] = self.tag or "latest"
            metadata[self.METADATA_ARTIFACT_KEY] = self.db_key
            metadata[self.METADATA_ARTIFACT_PROJECT] = self.metadata.project
            if target_path:
                metadata[self.METADATA_ARTIFACT_TARGET_PATH_KEY] = target_path

            for text in texts:
                metadata[self.METADATA_CHUNK_KEY] = str(idx)
                results.append(
                    Document(
                        page_content=text,
                        # Each chunk gets its own dict; otherwise all chunks would share one index
                        metadata=metadata.copy(),
                    )
                )
                idx += 1
        return results

    def collection_add(self, collection_id: str) -> bool:
        """
        Add a collection ID to the artifact's collection list.

        Adds the specified collection ID to the artifact's collection mapping if it
        doesn't already exist.
        This method only modifies the client-side artifact object and does not persist
        the changes to the MLRun DB. To save the changes permanently, you must call
        project.update_artifact() after this method.

        Args:
            collection_id (str): The ID of the collection to add

        Returns:
            bool: True if the collection ID was added, False if it was already present.
        """
        if collection_id in self.status.collections:
            return False
        self.status.collections[collection_id] = "1"
        return True

    def collection_remove(self, collection_id: str) -> bool:
        """
        Remove a collection ID from the artifact's collection list.

        Removes the specified collection ID from the artifact's local collection mapping.
        This method only modifies the client-side artifact object and does not persist
        the changes to the MLRun DB. To save the changes permanently, you must call
        project.update_artifact() or context.update_artifact() after this method.

        Args:
            collection_id (str): The ID of the collection to remove

        Returns:
            bool: True if the collection ID was removed, False if it was not present.
        """
        if collection_id not in self.status.collections:
            return False
        self.status.collections.pop(collection_id)
        return True
@@ -41,6 +41,7 @@ from .dataset import (
41
41
  DatasetArtifact,
42
42
  TableArtifact,
43
43
  )
44
+ from .document import DocumentArtifact
44
45
  from .model import ModelArtifact
45
46
  from .plots import (
46
47
  PlotArtifact,
@@ -57,6 +58,7 @@ artifact_types = {
57
58
  "model": ModelArtifact,
58
59
  "dataset": DatasetArtifact,
59
60
  "plotly": PlotlyArtifact,
61
+ "document": DocumentArtifact,
60
62
  }
61
63
 
62
64
 
@@ -106,7 +108,7 @@ class ArtifactProducer:
106
108
  def dict_to_artifact(struct: dict) -> Artifact:
107
109
  kind = struct.get("kind", "")
108
110
 
109
- # TODO: remove this in 1.8.0
111
+ # TODO: Remove once data migration v5 is obsolete
110
112
  if mlrun.utils.is_legacy_artifact(struct):
111
113
  return mlrun.artifacts.base.convert_legacy_artifact_to_new_format(struct)
112
114
 
@@ -124,7 +126,7 @@ class ArtifactManager:
124
126
 
125
127
  self.artifact_db = db
126
128
  self.input_artifacts = {}
127
- self.artifacts = {}
129
+ self.artifact_uris = {}
128
130
 
129
131
  @staticmethod
130
132
  def ensure_artifact_source_file_exists(item, path, body):
@@ -156,14 +158,12 @@ class ArtifactManager:
156
158
 
157
159
  def artifact_list(self, full=False):
158
160
  artifacts = []
159
- for artifact in self.artifacts.values():
160
- if isinstance(artifact, dict):
161
- artifacts.append(artifact)
161
+ for artifacts_uri in self.artifact_uris.values():
162
+ artifact: Artifact = mlrun.datastore.get_store_resource(artifacts_uri)
163
+ if full:
164
+ artifacts.append(artifact.to_dict())
162
165
  else:
163
- if full:
164
- artifacts.append(artifact.to_dict())
165
- else:
166
- artifacts.append(artifact.base_dict())
166
+ artifacts.append(artifact.base_dict())
167
167
  return artifacts
168
168
 
169
169
  def log_artifact(
@@ -246,6 +246,8 @@ class ArtifactManager:
246
246
  # otherwise, we do not want to override it.
247
247
  # this is mainly relevant for imported artifacts that have an explicit db_key value already set
248
248
  db_key = item.db_key or key
249
+ if db_key != key:
250
+ validate_artifact_key_name(db_key, "artifact.db_key")
249
251
  item.db_key = db_key or ""
250
252
  item.viewer = viewer or item.viewer
251
253
  item.tree = producer.tag
@@ -304,7 +306,6 @@ class ArtifactManager:
304
306
  item.target_path = target_path
305
307
 
306
308
  item.before_log()
307
- self.artifacts[key] = item
308
309
 
309
310
  if ((upload is None and item.kind != "dir") or upload) and not item.is_inline():
310
311
  # before uploading the item, we want to ensure that its tags are valid,
@@ -313,32 +314,38 @@ class ArtifactManager:
313
314
  item.upload(artifact_path=artifact_path)
314
315
 
315
316
  if db_key:
316
- self._log_to_db(db_key, project, producer.inputs, item)
317
+ artifact_uid = self._log_to_db(db_key, project, producer.inputs, item)
318
+ if artifact_uid is not None:
319
+ item.uid = artifact_uid
320
+ # Generate the artifact URI after logging to the database and retrieving the artifact UID, if available.
321
+ self.artifact_uris[key] = item.uri
322
+
317
323
  size = str(item.size) or "?"
318
324
  db_str = "Y" if (self.artifact_db and db_key) else "N"
319
325
  logger.debug(
320
- f"log artifact {key} at {item.target_path}, size: {size}, db: {db_str}"
326
+ f"Log artifact {key} at {item.target_path}, size: {size}, db: {db_str}"
321
327
  )
322
328
  return item
323
329
 
324
- def update_artifact(self, producer, item):
325
- self.artifacts[item.key] = item
330
+ def update_artifact(self, producer, item: Artifact):
331
+ self.artifact_uris[item.key] = item.uri
326
332
  self._log_to_db(item.db_key, producer.project, producer.inputs, item)
327
333
 
328
- def _log_to_db(self, key, project, sources, item, tag=None):
334
+ def _log_to_db(self, key, project, sources, item, tag=None) -> typing.Optional[str]:
329
335
  """
330
336
  log artifact to db
331
337
  :param key: Identifying key of the artifact.
332
338
  :param project: Project that the artifact belongs to.
333
- :param sources: List of artifact sources ( Mainly passed from the producer.items ).
339
+ :param sources: List of artifact sources ( Mainly passed from the `producer.items` ).
334
340
  :param item: The actual artifact to store.
335
341
  :param tag: The name of the Tag of the artifact.
342
+ :return: The logged artifact uid.
336
343
  """
337
344
  if self.artifact_db:
338
345
  item.updated = None
339
346
  if sources:
340
347
  item.sources = [{"name": k, "path": str(v)} for k, v in sources.items()]
341
- self.artifact_db.store_artifact(
348
+ artifact_item = self.artifact_db.store_artifact(
342
349
  key,
343
350
  item.to_dict(),
344
351
  iter=item.iter,
@@ -346,6 +353,8 @@ class ArtifactManager:
346
353
  project=project,
347
354
  tree=item.tree,
348
355
  )
356
+ if artifact_item:
357
+ return artifact_item.get("metadata", {}).get("uid")
349
358
 
350
359
  def link_artifact(
351
360
  self,
@@ -387,13 +396,14 @@ class ArtifactManager:
387
396
  deletion_strategy: mlrun.common.schemas.artifact.ArtifactsDeletionStrategies = (
388
397
  mlrun.common.schemas.artifact.ArtifactsDeletionStrategies.metadata_only
389
398
  ),
390
- secrets: dict = None,
399
+ secrets: typing.Optional[dict] = None,
391
400
  ):
392
401
  self.artifact_db.del_artifact(
393
402
  key=item.db_key,
394
403
  project=item.project,
395
404
  tag=item.tag,
396
405
  tree=item.tree,
406
+ iter=item.iter,
397
407
  deletion_strategy=deletion_strategy,
398
408
  secrets=secrets,
399
409
  )