mlrun 1.7.0rc13__py3-none-any.whl → 1.7.0rc21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.
Files changed (156)
  1. mlrun/__init__.py +10 -1
  2. mlrun/__main__.py +23 -111
  3. mlrun/alerts/__init__.py +15 -0
  4. mlrun/alerts/alert.py +144 -0
  5. mlrun/api/schemas/__init__.py +4 -3
  6. mlrun/artifacts/__init__.py +8 -3
  7. mlrun/artifacts/base.py +36 -253
  8. mlrun/artifacts/dataset.py +9 -190
  9. mlrun/artifacts/manager.py +46 -42
  10. mlrun/artifacts/model.py +9 -141
  11. mlrun/artifacts/plots.py +14 -375
  12. mlrun/common/constants.py +65 -3
  13. mlrun/common/formatters/__init__.py +19 -0
  14. mlrun/{runtimes/mpijob/v1alpha1.py → common/formatters/artifact.py} +6 -14
  15. mlrun/common/formatters/base.py +113 -0
  16. mlrun/common/formatters/function.py +46 -0
  17. mlrun/common/formatters/pipeline.py +53 -0
  18. mlrun/common/formatters/project.py +51 -0
  19. mlrun/{runtimes → common/runtimes}/constants.py +32 -4
  20. mlrun/common/schemas/__init__.py +10 -5
  21. mlrun/common/schemas/alert.py +92 -11
  22. mlrun/common/schemas/api_gateway.py +56 -0
  23. mlrun/common/schemas/artifact.py +15 -5
  24. mlrun/common/schemas/auth.py +2 -0
  25. mlrun/common/schemas/client_spec.py +1 -0
  26. mlrun/common/schemas/frontend_spec.py +1 -0
  27. mlrun/common/schemas/function.py +4 -0
  28. mlrun/common/schemas/model_monitoring/__init__.py +15 -3
  29. mlrun/common/schemas/model_monitoring/constants.py +58 -7
  30. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  31. mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
  32. mlrun/common/schemas/pipeline.py +0 -9
  33. mlrun/common/schemas/project.py +6 -11
  34. mlrun/common/types.py +1 -0
  35. mlrun/config.py +36 -8
  36. mlrun/data_types/to_pandas.py +9 -9
  37. mlrun/datastore/base.py +41 -9
  38. mlrun/datastore/datastore.py +6 -2
  39. mlrun/datastore/datastore_profile.py +56 -4
  40. mlrun/datastore/hdfs.py +5 -0
  41. mlrun/datastore/inmem.py +2 -2
  42. mlrun/datastore/redis.py +2 -2
  43. mlrun/datastore/s3.py +5 -0
  44. mlrun/datastore/sources.py +147 -7
  45. mlrun/datastore/store_resources.py +7 -7
  46. mlrun/datastore/targets.py +129 -9
  47. mlrun/datastore/utils.py +42 -0
  48. mlrun/datastore/v3io.py +1 -1
  49. mlrun/db/auth_utils.py +152 -0
  50. mlrun/db/base.py +55 -11
  51. mlrun/db/httpdb.py +346 -107
  52. mlrun/db/nopdb.py +52 -10
  53. mlrun/errors.py +11 -0
  54. mlrun/execution.py +24 -9
  55. mlrun/feature_store/__init__.py +0 -2
  56. mlrun/feature_store/api.py +12 -47
  57. mlrun/feature_store/feature_set.py +9 -0
  58. mlrun/feature_store/feature_vector.py +8 -0
  59. mlrun/feature_store/ingestion.py +7 -6
  60. mlrun/feature_store/retrieval/base.py +9 -4
  61. mlrun/feature_store/retrieval/conversion.py +9 -9
  62. mlrun/feature_store/retrieval/dask_merger.py +2 -0
  63. mlrun/feature_store/retrieval/job.py +9 -3
  64. mlrun/feature_store/retrieval/local_merger.py +2 -0
  65. mlrun/feature_store/retrieval/spark_merger.py +16 -0
  66. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
  67. mlrun/frameworks/parallel_coordinates.py +2 -1
  68. mlrun/frameworks/tf_keras/__init__.py +4 -1
  69. mlrun/k8s_utils.py +10 -11
  70. mlrun/launcher/base.py +4 -3
  71. mlrun/launcher/client.py +5 -3
  72. mlrun/launcher/local.py +8 -2
  73. mlrun/launcher/remote.py +8 -2
  74. mlrun/lists.py +6 -2
  75. mlrun/model.py +62 -20
  76. mlrun/model_monitoring/__init__.py +1 -1
  77. mlrun/model_monitoring/api.py +41 -18
  78. mlrun/model_monitoring/application.py +5 -305
  79. mlrun/model_monitoring/applications/__init__.py +11 -0
  80. mlrun/model_monitoring/applications/_application_steps.py +157 -0
  81. mlrun/model_monitoring/applications/base.py +280 -0
  82. mlrun/model_monitoring/applications/context.py +214 -0
  83. mlrun/model_monitoring/applications/evidently_base.py +211 -0
  84. mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
  85. mlrun/model_monitoring/applications/results.py +99 -0
  86. mlrun/model_monitoring/controller.py +3 -1
  87. mlrun/model_monitoring/db/__init__.py +2 -0
  88. mlrun/model_monitoring/db/stores/__init__.py +0 -2
  89. mlrun/model_monitoring/db/stores/base/store.py +22 -37
  90. mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +43 -21
  91. mlrun/model_monitoring/db/stores/sqldb/models/base.py +39 -8
  92. mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +27 -7
  93. mlrun/model_monitoring/db/stores/sqldb/models/sqlite.py +5 -0
  94. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +246 -224
  95. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +232 -216
  96. mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
  97. mlrun/model_monitoring/db/tsdb/base.py +329 -0
  98. mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
  99. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  100. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
  101. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  102. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
  103. mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
  104. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
  105. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +636 -0
  106. mlrun/model_monitoring/evidently_application.py +6 -118
  107. mlrun/model_monitoring/helpers.py +46 -1
  108. mlrun/model_monitoring/model_endpoint.py +3 -2
  109. mlrun/model_monitoring/stream_processing.py +57 -216
  110. mlrun/model_monitoring/writer.py +134 -124
  111. mlrun/package/utils/_formatter.py +2 -2
  112. mlrun/platforms/__init__.py +10 -9
  113. mlrun/platforms/iguazio.py +21 -202
  114. mlrun/projects/operations.py +19 -12
  115. mlrun/projects/pipelines.py +103 -109
  116. mlrun/projects/project.py +377 -137
  117. mlrun/render.py +15 -14
  118. mlrun/run.py +16 -47
  119. mlrun/runtimes/__init__.py +6 -3
  120. mlrun/runtimes/base.py +8 -7
  121. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  122. mlrun/runtimes/funcdoc.py +0 -28
  123. mlrun/runtimes/kubejob.py +2 -1
  124. mlrun/runtimes/local.py +5 -2
  125. mlrun/runtimes/mpijob/__init__.py +0 -20
  126. mlrun/runtimes/mpijob/v1.py +1 -1
  127. mlrun/runtimes/nuclio/api_gateway.py +440 -208
  128. mlrun/runtimes/nuclio/application/application.py +170 -8
  129. mlrun/runtimes/nuclio/function.py +39 -49
  130. mlrun/runtimes/pod.py +21 -41
  131. mlrun/runtimes/remotesparkjob.py +9 -3
  132. mlrun/runtimes/sparkjob/spark3job.py +1 -1
  133. mlrun/runtimes/utils.py +6 -45
  134. mlrun/serving/server.py +2 -1
  135. mlrun/serving/states.py +53 -2
  136. mlrun/serving/v2_serving.py +5 -1
  137. mlrun/track/tracker.py +2 -1
  138. mlrun/utils/async_http.py +25 -5
  139. mlrun/utils/helpers.py +107 -75
  140. mlrun/utils/logger.py +39 -7
  141. mlrun/utils/notifications/notification/__init__.py +14 -9
  142. mlrun/utils/notifications/notification/base.py +1 -1
  143. mlrun/utils/notifications/notification/slack.py +61 -13
  144. mlrun/utils/notifications/notification/webhook.py +1 -1
  145. mlrun/utils/notifications/notification_pusher.py +147 -16
  146. mlrun/utils/regex.py +9 -0
  147. mlrun/utils/v3io_clients.py +0 -1
  148. mlrun/utils/version/version.json +2 -2
  149. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/METADATA +14 -6
  150. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/RECORD +154 -133
  151. mlrun/kfpops.py +0 -865
  152. mlrun/platforms/other.py +0 -305
  153. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/LICENSE +0 -0
  154. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/WHEEL +0 -0
  155. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/entry_points.txt +0 -0
  156. {mlrun-1.7.0rc13.dist-info → mlrun-1.7.0rc21.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py CHANGED
@@ -179,11 +179,23 @@ class DataStore:
  return {}

  @staticmethod
- def _parquet_reader(df_module, url, file_system, time_column, start_time, end_time):
+ def _parquet_reader(
+ df_module,
+ url,
+ file_system,
+ time_column,
+ start_time,
+ end_time,
+ additional_filters,
+ ):
  from storey.utils import find_filters, find_partitions

  def set_filters(
- partitions_time_attributes, start_time_inner, end_time_inner, kwargs
+ partitions_time_attributes,
+ start_time_inner,
+ end_time_inner,
+ filters_inner,
+ kwargs,
  ):
  filters = []
  find_filters(
@@ -193,20 +205,23 @@ class DataStore:
  filters,
  time_column,
  )
+ if filters and filters_inner:
+ filters[0] += filters_inner
+
  kwargs["filters"] = filters

  def reader(*args, **kwargs):
- if start_time or end_time:
- if time_column is None:
- raise mlrun.errors.MLRunInvalidArgumentError(
- "When providing start_time or end_time, must provide time_column"
- )
-
+ if time_column is None and (start_time or end_time):
+ raise mlrun.errors.MLRunInvalidArgumentError(
+ "When providing start_time or end_time, must provide time_column"
+ )
+ if start_time or end_time or additional_filters:
  partitions_time_attributes = find_partitions(url, file_system)
  set_filters(
  partitions_time_attributes,
  start_time,
  end_time,
+ additional_filters,
  kwargs,
  )
  try:
@@ -217,6 +232,7 @@ class DataStore:
  ):
  raise ex

+ # TODO: fix timezone issue (ML-6308)
  if start_time.tzinfo:
  start_time_inner = start_time.replace(tzinfo=None)
  end_time_inner = end_time.replace(tzinfo=None)
@@ -228,6 +244,7 @@ class DataStore:
  partitions_time_attributes,
  start_time_inner,
  end_time_inner,
+ additional_filters,
  kwargs,
  )
  return df_module.read_parquet(*args, **kwargs)
@@ -246,6 +263,7 @@ class DataStore:
  start_time=None,
  end_time=None,
  time_column=None,
+ additional_filters=None,
  **kwargs,
  ):
  df_module = df_module or pd
@@ -310,7 +328,13 @@ class DataStore:
  kwargs["columns"] = columns

  reader = self._parquet_reader(
- df_module, url, file_system, time_column, start_time, end_time
+ df_module,
+ url,
+ file_system,
+ time_column,
+ start_time,
+ end_time,
+ additional_filters,
  )

  elif file_url.endswith(".json") or format == "json":
@@ -539,6 +563,7 @@ class DataItem:
  time_column=None,
  start_time=None,
  end_time=None,
+ additional_filters=None,
  **kwargs,
  ):
  """return a dataframe object (generated from the dataitem).
@@ -550,6 +575,12 @@ class DataItem:
  :param end_time: filters out data after this time
  :param time_column: Store timestamp_key will be used if None.
  The results will be filtered by this column and start_time & end_time.
+ :param additional_filters: List of additional_filter conditions as tuples.
+ Each tuple should be in the format (column_name, operator, value).
+ Supported operators: "=", ">=", "<=", ">", "<".
+ Example: [("Product", "=", "Computer")]
+ For all supported filters, please see:
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
  """
  df = self._store.as_df(
  self._url,
@@ -560,6 +591,7 @@ class DataItem:
  time_column=time_column,
  start_time=start_time,
  end_time=end_time,
+ additional_filters=additional_filters,
  **kwargs,
  )
  return df
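
The new additional_filters argument on DataItem.as_df() forwards pyarrow-style filter tuples to the parquet reader. A minimal usage sketch based on the docstring above; the dataset URL and column names are illustrative:

    import mlrun

    # Hypothetical parquet artifact; any data item pointing at a parquet file works.
    item = mlrun.get_dataitem("store://datasets/my-project/sales.parquet")

    # Filter rows while reading, in addition to the usual start_time/end_time window.
    df = item.as_df(
        additional_filters=[("Product", "=", "Computer"), ("Quantity", ">=", 5)],
    )
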
mlrun/datastore/datastore.py CHANGED
@@ -223,6 +223,11 @@ class StoreManager:
  subpath = url[len("memory://") :]
  return in_memory_store, subpath, url

+ elif schema in get_local_file_schema():
+ # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+ # As a workaround, we set subpath to the url.
+ subpath = url.replace("file://", "", 1)
+
  if not schema and endpoint:
  if endpoint in self._stores.keys():
  return self._stores[endpoint], subpath, url
@@ -241,8 +246,7 @@ class StoreManager:
  )
  if not secrets and not mlrun.config.is_running_as_api():
  self._stores[store_key] = store
- # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
- return store, url if store.kind == "file" else subpath, url
+ return store, subpath, url

  def reset_secrets(self):
  self._secrets = {}
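
The new branch keeps the drive letter in the returned subpath for local file URLs instead of special-casing file stores at return time. A rough sketch of the intent, using a hypothetical Windows-style path:

    import mlrun

    # Hypothetical local file URL with a Windows drive letter.
    store, subpath, url = mlrun.store_manager.get_or_create_store(r"file://c:\data\train.csv")
    # subpath now keeps the drive letter, e.g. r"c:\data\train.csv"
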
mlrun/datastore/datastore_profile.py CHANGED
@@ -37,6 +37,7 @@ class DatastoreProfile(pydantic.BaseModel):
  extra = pydantic.Extra.forbid

  @pydantic.validator("name")
+ @classmethod
  def lower_case(cls, v):
  return v.lower()

@@ -185,6 +186,18 @@ class DatastoreProfileS3(DatastoreProfile):
  assume_role_arn: typing.Optional[str] = None
  access_key_id: typing.Optional[str] = None
  secret_key: typing.Optional[str] = None
+ bucket: typing.Optional[str] = None
+
+ @pydantic.validator("bucket")
+ @classmethod
+ def check_bucket(cls, v):
+ if not v:
+ warnings.warn(
+ "The 'bucket' attribute will be mandatory starting from version 1.9",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return v

  def secrets(self) -> dict:
  res = {}
@@ -203,7 +216,13 @@ class DatastoreProfileS3(DatastoreProfile):
  return res

  def url(self, subpath):
- return f"s3:/{subpath}"
+ # TODO: There is an inconsistency with DatastoreProfileGCS. In DatastoreProfileGCS,
+ # we assume that the subpath can begin without a '/' character,
+ # while here we assume it always starts with one.
+ if self.bucket:
+ return f"s3://{self.bucket}{subpath}"
+ else:
+ return f"s3:/{subpath}"


  class DatastoreProfileRedis(DatastoreProfile):
@@ -272,18 +291,36 @@ class DatastoreProfileGCS(DatastoreProfile):
  _private_attributes = ("gcp_credentials",)
  credentials_path: typing.Optional[str] = None # path to file.
  gcp_credentials: typing.Optional[typing.Union[str, dict]] = None
+ bucket: typing.Optional[str] = None
+
+ @pydantic.validator("bucket")
+ @classmethod
+ def check_bucket(cls, v):
+ if not v:
+ warnings.warn(
+ "The 'bucket' attribute will be mandatory starting from version 1.9",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return v

  @pydantic.validator("gcp_credentials", pre=True, always=True)
+ @classmethod
  def convert_dict_to_json(cls, v):
  if isinstance(v, dict):
  return json.dumps(v)
  return v

  def url(self, subpath) -> str:
+ # TODO: but there's something wrong with the subpath being assumed to not start with a slash here,
+ # but the opposite assumption is made in S3.
  if subpath.startswith("/"):
  # in gcs the path after schema is starts with bucket, wherefore it should not start with "/".
  subpath = subpath[1:]
- return f"gcs://{subpath}"
+ if self.bucket:
+ return f"gcs://{self.bucket}/{subpath}"
+ else:
+ return f"gcs://{subpath}"

  def secrets(self) -> dict:
  res = {}
@@ -311,12 +348,27 @@ class DatastoreProfileAzureBlob(DatastoreProfile):
  client_secret: typing.Optional[str] = None
  sas_token: typing.Optional[str] = None
  credential: typing.Optional[str] = None
+ container: typing.Optional[str] = None
+
+ @pydantic.validator("container")
+ @classmethod
+ def check_container(cls, v):
+ if not v:
+ warnings.warn(
+ "The 'container' attribute will be mandatory starting from version 1.9",
+ FutureWarning,
+ stacklevel=2,
+ )
+ return v

  def url(self, subpath) -> str:
  if subpath.startswith("/"):
- # in azure the path after schema is starts with bucket, wherefore it should not start with "/".
+ # in azure the path after schema is starts with container, wherefore it should not start with "/".
  subpath = subpath[1:]
- return f"az://{subpath}"
+ if self.container:
+ return f"az://{self.container}/{subpath}"
+ else:
+ return f"az://{subpath}"

  def secrets(self) -> dict:
  res = {}
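
The S3, GCS, and Azure profiles now accept an optional bucket/container that url() prepends when present, and they emit a FutureWarning when it is omitted. A small sketch of the resulting URLs; the profile, bucket, and path names are illustrative:

    from mlrun.datastore.datastore_profile import DatastoreProfileAzureBlob, DatastoreProfileS3

    s3_profile = DatastoreProfileS3(name="my-s3", bucket="sales-data")
    print(s3_profile.url("/2024/q1.parquet"))  # s3://sales-data/2024/q1.parquet

    az_profile = DatastoreProfileAzureBlob(name="my-blob", container="sales-data")
    print(az_profile.url("/2024/q1.parquet"))  # az://sales-data/2024/q1.parquet
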
mlrun/datastore/hdfs.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import os
+ from urllib.parse import urlparse

  import fsspec

@@ -49,3 +50,7 @@ class HdfsStore(DataStore):
  @property
  def spark_url(self):
  return f"hdfs://{self.host}:{self.port}"
+
+ def rm(self, url, recursive=False, maxdepth=None):
+ path = urlparse(url).path
+ self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
mlrun/datastore/inmem.py CHANGED
@@ -80,8 +80,8 @@ class InMemoryStore(DataStore):
  reader = df_module.read_json
  else:
  raise mlrun.errors.MLRunInvalidArgumentError(f"file type unhandled {url}")
- # InMemoryStore store do not filter on time
- for field in ["time_column", "start_time", "end_time"]:
+ # InMemoryStore store don't pass filters
+ for field in ["time_column", "start_time", "end_time", "additional_filters"]:
  kwargs.pop(field, None)

  return reader(item, **kwargs)
mlrun/datastore/redis.py CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
  """

  def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
- REDIS_DEFAULT_PORT = "6379"
+ redis_default_port = "6379"
  super().__init__(parent, name, schema, endpoint, secrets=secrets)
  self.headers = None

@@ -49,7 +49,7 @@ class RedisStore(DataStore):
  user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
  password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
  host = parsed_endpoint.hostname
- port = parsed_endpoint.port if parsed_endpoint.port else REDIS_DEFAULT_PORT
+ port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
  schema = parsed_endpoint.scheme
  if user or password:
  endpoint = f"{schema}://{user}:{password}@{host}:{port}"
mlrun/datastore/s3.py CHANGED
@@ -198,6 +198,11 @@ class S3Store(DataStore):
  bucket = self.s3.Bucket(bucket)
  return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]

+ def rm(self, path, recursive=False, maxdepth=None):
+ bucket, key = self.get_bucket_and_key(path)
+ path = f"{bucket}/{key}"
+ self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+

  def parse_s3_bucket_and_key(s3_path):
  try:
mlrun/datastore/sources.py CHANGED
@@ -12,6 +12,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  import json
+ import operator
  import os
  import warnings
  from base64 import b64encode
@@ -29,6 +30,7 @@ from nuclio.config import split_path
  import mlrun
  from mlrun.config import config
  from mlrun.datastore.snowflake_utils import get_snowflake_spark_options
+ from mlrun.datastore.utils import transform_list_filters_to_tuple
  from mlrun.secrets import SecretsStore

  from ..model import DataSource
@@ -102,8 +104,12 @@ class BaseSourceDriver(DataSource):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
  """return the source data as dataframe"""
+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
  return mlrun.store_manager.object(url=self.path).as_df(
  columns=columns,
  df_module=df_module,
@@ -174,7 +180,7 @@ class CSVSource(BaseSourceDriver):
  self,
  name: str = "",
  path: str = None,
- attributes: dict[str, str] = None,
+ attributes: dict[str, object] = None,
  key_field: str = None,
  schedule: str = None,
  parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -245,7 +251,11 @@ class CSVSource(BaseSourceDriver):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
  reader_args = self.attributes.get("reader_args", {})
  return mlrun.store_manager.object(url=self.path).as_df(
  columns=columns,
@@ -281,6 +291,12 @@ class ParquetSource(BaseSourceDriver):
  :parameter start_time: filters out data before this time
  :parameter end_time: filters out data after this time
  :parameter attributes: additional parameters to pass to storey.
+ :param additional_filters: List of additional_filter conditions as tuples.
+ Each tuple should be in the format (column_name, operator, value).
+ Supported operators: "=", ">=", "<=", ">", "<".
+ Example: [("Product", "=", "Computer")]
+ For all supported filters, please see:
+ https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
  """

  kind = "parquet"
@@ -291,13 +307,19 @@ class ParquetSource(BaseSourceDriver):
  self,
  name: str = "",
  path: str = None,
- attributes: dict[str, str] = None,
+ attributes: dict[str, object] = None,
  key_field: str = None,
  time_field: str = None,
  schedule: str = None,
  start_time: Optional[Union[datetime, str]] = None,
  end_time: Optional[Union[datetime, str]] = None,
+ additional_filters: Optional[list[Union[tuple, list]]] = None,
  ):
+ if additional_filters:
+ attributes = copy(attributes) or {}
+ additional_filters = transform_list_filters_to_tuple(additional_filters)
+ attributes["additional_filters"] = additional_filters
+
  super().__init__(
  name,
  path,
@@ -325,6 +347,10 @@ class ParquetSource(BaseSourceDriver):
  def end_time(self, end_time):
  self._end_time = self._convert_to_datetime(end_time)

+ @property
+ def additional_filters(self):
+ return self.attributes.get("additional_filters")
+
  @staticmethod
  def _convert_to_datetime(time):
  if time and isinstance(time, str):
@@ -341,16 +367,17 @@ class ParquetSource(BaseSourceDriver):
  start_time=None,
  end_time=None,
  context=None,
+ additional_filters=None,
  ):
  import storey

- attributes = self.attributes or {}
+ attributes = copy(self.attributes)
+ attributes.pop("additional_filters", None)
  if context:
  attributes["context"] = context
-
+ additional_filters = transform_list_filters_to_tuple(additional_filters)
  data_item = mlrun.store_manager.object(self.path)
  store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
  return storey.ParquetSource(
  paths=url, # unlike self.path, it already has store:// replaced
  key_field=self.key_field or key_field,
@@ -358,9 +385,20 @@ class ParquetSource(BaseSourceDriver):
  end_filter=self.end_time,
  start_filter=self.start_time,
  filter_column=self.time_field or time_field,
+ additional_filters=self.additional_filters or additional_filters,
  **attributes,
  )

+ @classmethod
+ def from_dict(cls, struct=None, fields=None, deprecated_fields: dict = None):
+ new_obj = super().from_dict(
+ struct=struct, fields=fields, deprecated_fields=deprecated_fields
+ )
+ new_obj.attributes["additional_filters"] = transform_list_filters_to_tuple(
+ new_obj.additional_filters
+ )
+ return new_obj
+
  def get_spark_options(self):
  store, path, _ = mlrun.store_manager.get_or_create_store(self.path)
  spark_options = store.get_spark_options()
@@ -380,8 +418,10 @@ class ParquetSource(BaseSourceDriver):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
  reader_args = self.attributes.get("reader_args", {})
+ additional_filters = transform_list_filters_to_tuple(additional_filters)
  return mlrun.store_manager.object(url=self.path).as_df(
  columns=columns,
  df_module=df_module,
@@ -389,9 +429,88 @@ class ParquetSource(BaseSourceDriver):
  end_time=end_time or self.end_time,
  time_column=time_field or self.time_field,
  format="parquet",
+ additional_filters=additional_filters or self.additional_filters,
  **reader_args,
  )

+ def _build_spark_additional_filters(self, column_types: dict):
+ if not self.additional_filters:
+ return None
+ from pyspark.sql.functions import col, isnan, lit
+
+ operators = {
+ "==": operator.eq,
+ "=": operator.eq,
+ ">": operator.gt,
+ "<": operator.lt,
+ ">=": operator.ge,
+ "<=": operator.le,
+ "!=": operator.ne,
+ }
+
+ spark_filter = None
+ new_filter = lit(True)
+ for filter_tuple in self.additional_filters:
+ if not filter_tuple:
+ continue
+ col_name, op, value = filter_tuple
+ if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+ none_exists = False
+ value = list(value)
+ for sub_value in value:
+ if sub_value is None:
+ value.remove(sub_value)
+ none_exists = True
+ if none_exists:
+ filter_nan = column_types[col_name] not in ("timestamp", "date")
+ if value:
+ if op.lower() == "in":
+ new_filter = (
+ col(col_name).isin(value) | col(col_name).isNull()
+ )
+ if filter_nan:
+ new_filter = new_filter | isnan(col(col_name))
+
+ else:
+ new_filter = (
+ ~col(col_name).isin(value) & ~col(col_name).isNull()
+ )
+ if filter_nan:
+ new_filter = new_filter & ~isnan(col(col_name))
+ else:
+ if op.lower() == "in":
+ new_filter = col(col_name).isNull()
+ if filter_nan:
+ new_filter = new_filter | isnan(col(col_name))
+ else:
+ new_filter = ~col(col_name).isNull()
+ if filter_nan:
+ new_filter = new_filter & ~isnan(col(col_name))
+ else:
+ if op.lower() == "in":
+ new_filter = col(col_name).isin(value)
+ elif op.lower() == "not in":
+ new_filter = ~col(col_name).isin(value)
+ elif op in operators:
+ new_filter = operators[op](col(col_name), value)
+ else:
+ raise mlrun.errors.MLRunInvalidArgumentError(
+ f"unsupported filter operator: {op}"
+ )
+ if spark_filter is not None:
+ spark_filter = spark_filter & new_filter
+ else:
+ spark_filter = new_filter
+ return spark_filter
+
+ def _filter_spark_df(self, df, time_field=None, columns=None):
+ spark_additional_filters = self._build_spark_additional_filters(
+ column_types=dict(df.dtypes)
+ )
+ if spark_additional_filters is not None:
+ df = df.filter(spark_additional_filters)
+ return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+

  class BigQuerySource(BaseSourceDriver):
  """
@@ -519,10 +638,15 @@ class BigQuerySource(BaseSourceDriver):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
  from google.cloud import bigquery
  from google.cloud.bigquery_storage_v1 import BigQueryReadClient

+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
+
  def schema_to_dtypes(schema):
  from mlrun.data_types.data_types import gbq_to_pandas_dtype

@@ -562,7 +686,6 @@ class BigQuerySource(BaseSourceDriver):
  else:
  df = rows_iterator.to_dataframe(dtypes=dtypes)

- # TODO : filter as part of the query
  return select_columns_from_df(
  filter_df_start_end_time(
  df,
@@ -740,7 +863,19 @@ class DataFrameSource:
  context=self.context or context,
  )

- def to_dataframe(self, **kwargs):
+ def to_dataframe(
+ self,
+ columns=None,
+ df_module=None,
+ entities=None,
+ start_time=None,
+ end_time=None,
+ time_field=None,
+ additional_filters=None,
+ ):
+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
  return self._df

  def is_iterator(self):
@@ -935,6 +1070,7 @@ class KafkaSource(OnlineSource):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
  raise mlrun.MLRunInvalidArgumentError(
  "KafkaSource does not support batch processing"
@@ -1075,9 +1211,13 @@ class SQLSource(BaseSourceDriver):
  start_time=None,
  end_time=None,
  time_field=None,
+ additional_filters=None,
  ):
  import sqlalchemy as sqlalchemy

+ mlrun.utils.helpers.additional_filters_warning(
+ additional_filters, self.__class__
+ )
  db_path = self.attributes.get("db_path")
  table_name = self.attributes.get("table_name")
  parse_dates = self.attributes.get("parse_dates")
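
ParquetSource now stores additional_filters in its attributes and applies them in every read path: pyarrow filters for pandas reads, storey's ParquetSource in flows, and a Spark filter built by _build_spark_additional_filters. A minimal sketch; the source path and column names are illustrative:

    from mlrun.datastore.sources import ParquetSource

    source = ParquetSource(
        name="sales",
        path="v3io:///projects/my-project/sales.parquet",  # hypothetical location
        additional_filters=[("Product", "=", "Computer"), ("Quantity", ">", 0)],
    )

    # The same filters are applied when reading the source directly as a dataframe.
    df = source.to_dataframe()
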
mlrun/datastore/store_resources.py CHANGED
@@ -17,7 +17,7 @@
  import mlrun
  import mlrun.artifacts
  from mlrun.config import config
- from mlrun.utils.helpers import is_legacy_artifact, parse_artifact_uri
+ from mlrun.utils.helpers import parse_artifact_uri

  from ..common.helpers import parse_versioned_object_uri
  from ..platforms.iguazio import parse_path
@@ -146,7 +146,11 @@ def get_store_resource(

  db = db or mlrun.get_run_db(secrets=secrets)
  kind, uri = parse_store_uri(uri)
- if kind == StorePrefix.FeatureSet:
+ if not kind:
+ raise mlrun.errors.MLRunInvalidArgumentError(
+ f"Cannot get store resource from invalid URI: {uri}"
+ )
+ elif kind == StorePrefix.FeatureSet:
  project, name, tag, uid = parse_versioned_object_uri(
  uri, project or config.default_project
  )
@@ -167,11 +171,7 @@ def get_store_resource(
  )
  if resource.get("kind", "") == "link":
  # todo: support other link types (not just iter, move this to the db/api layer
- link_iteration = (
- resource.get("link_iteration", 0)
- if is_legacy_artifact(resource)
- else resource["spec"].get("link_iteration", 0)
- )
+ link_iteration = resource["spec"].get("link_iteration", 0)

  resource = db.read_artifact(
  key,