mlrun 1.7.0rc17__py3-none-any.whl → 1.7.0rc18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic.

Files changed (55)
  1. mlrun/alerts/alert.py +1 -1
  2. mlrun/artifacts/manager.py +5 -1
  3. mlrun/common/runtimes/constants.py +3 -0
  4. mlrun/common/schemas/__init__.py +1 -1
  5. mlrun/common/schemas/alert.py +31 -9
  6. mlrun/common/schemas/client_spec.py +1 -0
  7. mlrun/common/schemas/function.py +4 -0
  8. mlrun/common/schemas/model_monitoring/__init__.py +3 -1
  9. mlrun/common/schemas/model_monitoring/constants.py +20 -1
  10. mlrun/common/schemas/model_monitoring/grafana.py +9 -5
  11. mlrun/common/schemas/model_monitoring/model_endpoints.py +17 -6
  12. mlrun/config.py +2 -0
  13. mlrun/data_types/to_pandas.py +5 -5
  14. mlrun/datastore/datastore.py +6 -2
  15. mlrun/datastore/redis.py +2 -2
  16. mlrun/datastore/s3.py +5 -0
  17. mlrun/datastore/sources.py +111 -6
  18. mlrun/datastore/targets.py +2 -2
  19. mlrun/db/base.py +5 -1
  20. mlrun/db/httpdb.py +22 -3
  21. mlrun/db/nopdb.py +5 -1
  22. mlrun/errors.py +6 -0
  23. mlrun/feature_store/retrieval/conversion.py +5 -5
  24. mlrun/feature_store/retrieval/job.py +3 -2
  25. mlrun/feature_store/retrieval/spark_merger.py +2 -1
  26. mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -2
  27. mlrun/model_monitoring/db/stores/base/store.py +16 -3
  28. mlrun/model_monitoring/db/stores/sqldb/sql_store.py +44 -43
  29. mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +190 -91
  30. mlrun/model_monitoring/db/tsdb/__init__.py +35 -6
  31. mlrun/model_monitoring/db/tsdb/base.py +25 -18
  32. mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
  33. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +207 -0
  34. mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
  35. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +231 -0
  36. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +73 -72
  37. mlrun/model_monitoring/db/v3io_tsdb_reader.py +217 -16
  38. mlrun/model_monitoring/helpers.py +32 -0
  39. mlrun/model_monitoring/stream_processing.py +7 -4
  40. mlrun/model_monitoring/writer.py +18 -13
  41. mlrun/package/utils/_formatter.py +2 -2
  42. mlrun/projects/project.py +33 -8
  43. mlrun/render.py +8 -5
  44. mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
  45. mlrun/utils/async_http.py +25 -5
  46. mlrun/utils/helpers.py +20 -1
  47. mlrun/utils/notifications/notification/slack.py +27 -7
  48. mlrun/utils/notifications/notification_pusher.py +38 -40
  49. mlrun/utils/version/version.json +2 -2
  50. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/METADATA +7 -2
  51. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/RECORD +55 -51
  52. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/LICENSE +0 -0
  53. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/WHEEL +0 -0
  54. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/entry_points.txt +0 -0
  55. {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/top_level.txt +0 -0
mlrun/alerts/alert.py CHANGED
@@ -137,7 +137,7 @@ class AlertConfig(ModelObj):
         template = db.get_alert_template(template)
 
         # Extract parameters from the template and apply them to the AlertConfig object
-        self.description = template.description
+        self.summary = template.summary
         self.severity = template.severity
         self.criteria = template.criteria
         self.trigger = template.trigger
mlrun/artifacts/manager.py CHANGED
@@ -72,6 +72,10 @@ class ArtifactProducer:
     def get_meta(self) -> dict:
         return {"kind": self.kind, "name": self.name, "tag": self.tag}
 
+    @property
+    def uid(self):
+        return None
+
 
 def dict_to_artifact(struct: dict) -> Artifact:
     kind = struct.get("kind", "")
@@ -262,7 +266,7 @@ class ArtifactManager:
         if target_path and item.is_dir and not target_path.endswith("/"):
             target_path += "/"
         target_path = template_artifact_path(
-            artifact_path=target_path, project=producer.project
+            artifact_path=target_path, project=producer.project, run_uid=producer.uid
         )
         item.target_path = target_path
 
mlrun/common/runtimes/constants.py CHANGED
@@ -136,6 +136,7 @@ class RunStates:
     unknown = "unknown"
     aborted = "aborted"
     aborting = "aborting"
+    skipped = "skipped"
 
     @staticmethod
     def all():
@@ -148,6 +149,7 @@ class RunStates:
             RunStates.unknown,
             RunStates.aborted,
             RunStates.aborting,
+            RunStates.skipped,
         ]
 
     @staticmethod
@@ -156,6 +158,7 @@ class RunStates:
             RunStates.completed,
             RunStates.error,
             RunStates.aborted,
+            RunStates.skipped,
         ]
 
     @staticmethod
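Note: with the additions above, a skipped pipeline step becomes a known run state and is listed alongside the terminal states (completed, error, aborted). A minimal check using only the names shown in these hunks (requires this release installed):

    from mlrun.common.runtimes.constants import RunStates

    # "skipped" is a plain string constant on RunStates, like the other states
    assert RunStates.skipped == "skipped"
    # and it is now included in the full list of known run states
    assert RunStates.skipped in RunStates.all()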
mlrun/common/schemas/__init__.py CHANGED
@@ -148,10 +148,10 @@ from .model_monitoring import (
     ModelMonitoringMode,
     ModelMonitoringStoreKinds,
     MonitoringFunctionNames,
-    MonitoringTSDBTables,
     PrometheusEndpoints,
     TimeSeriesConnector,
     TSDBTarget,
+    V3IOTSDBTables,
 )
 from .notification import (
     Notification,
mlrun/common/schemas/alert.py CHANGED
@@ -22,7 +22,7 @@ from mlrun.common.types import StrEnum
 
 
 class EventEntityKind(StrEnum):
-    MODEL = "model"
+    MODEL_ENDPOINT_RESULT = "model-endpoint-result"
     JOB = "job"
 
 
@@ -33,14 +33,34 @@ class EventEntities(pydantic.BaseModel):
 
 
 class EventKind(StrEnum):
-    DRIFT_DETECTED = "drift_detected"
-    DRIFT_SUSPECTED = "drift_suspected"
+    DATA_DRIFT_DETECTED = "data_drift_detected"
+    DATA_DRIFT_SUSPECTED = "data_drift_suspected"
+    CONCEPT_DRIFT_DETECTED = "concept_drift_detected"
+    CONCEPT_DRIFT_SUSPECTED = "concept_drift_suspected"
+    MODEL_PERFORMANCE_DETECTED = "model_performance_detected"
+    MODEL_PERFORMANCE_SUSPECTED = "model_performance_suspected"
+    MODEL_SERVING_PERFORMANCE_DETECTED = "model_serving_performance_detected"
+    MODEL_SERVING_PERFORMANCE_SUSPECTED = "model_serving_performance_suspected"
+    MM_APP_ANOMALY_DETECTED = "mm_app_anomaly_detected"
+    MM_APP_ANOMALY_SUSPECTED = "mm_app_anomaly_suspected"
     FAILED = "failed"
 
 
 _event_kind_entity_map = {
-    EventKind.DRIFT_SUSPECTED: [EventEntityKind.MODEL],
-    EventKind.DRIFT_DETECTED: [EventEntityKind.MODEL],
+    EventKind.DATA_DRIFT_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.DATA_DRIFT_DETECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.CONCEPT_DRIFT_DETECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.CONCEPT_DRIFT_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.MODEL_PERFORMANCE_DETECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.MODEL_PERFORMANCE_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.MODEL_SERVING_PERFORMANCE_DETECTED: [
+        EventEntityKind.MODEL_ENDPOINT_RESULT
+    ],
+    EventKind.MODEL_SERVING_PERFORMANCE_SUSPECTED: [
+        EventEntityKind.MODEL_ENDPOINT_RESULT
+    ],
+    EventKind.MM_APP_ANOMALY_DETECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
+    EventKind.MM_APP_ANOMALY_SUSPECTED: [EventEntityKind.MODEL_ENDPOINT_RESULT],
     EventKind.FAILED: [EventEntityKind.JOB],
 }
 
@@ -123,7 +143,8 @@ class AlertConfig(pydantic.BaseModel):
         pydantic.Field(
             description=(
                 "String to be sent in the notifications generated."
-                "e.g. 'Model {{ $project }}/{{ $entity }} is drifting.'"
+                "e.g. 'Model {{project}}/{{entity}} is drifting.'"
+                "Supported variables: project, entity, name"
             )
         ),
     ]
@@ -161,8 +182,9 @@ class AlertTemplate(
     system_generated: bool = False
 
     # AlertConfig fields that are pre-defined
-    description: Optional[str] = (
-        "String to be sent in the generated notifications e.g. 'Model {{ $project }}/{{ $entity }} is drifting.'"
+    summary: Optional[str] = (
+        "String to be sent in the generated notifications e.g. 'Model {{project}}/{{entity}} is drifting.'"
+        "See AlertConfig.summary description"
     )
     severity: AlertSeverity
     trigger: AlertTrigger
@@ -173,7 +195,7 @@ class AlertTemplate(
     def templates_differ(self, other):
         return (
             self.template_description != other.template_description
-            or self.description != other.description
+            or self.summary != other.summary
            or self.severity != other.severity
            or self.trigger != other.trigger
            or self.reset_policy != other.reset_policy
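Note: the single drift event kind is replaced by per-category kinds, all bound to the new model-endpoint-result entity. A small sketch exercising only the enums added above:

    from mlrun.common.schemas.alert import EventEntityKind, EventKind

    # EventKind.DRIFT_DETECTED / DRIFT_SUSPECTED are gone; callers pick a specific category
    kind = EventKind.DATA_DRIFT_DETECTED
    assert kind.value == "data_drift_detected"

    # model-monitoring events are now attached to a model-endpoint result, not to a "model"
    assert EventEntityKind.MODEL_ENDPOINT_RESULT.value == "model-endpoint-result"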
mlrun/common/schemas/client_spec.py CHANGED
@@ -59,6 +59,7 @@ class ClientSpec(pydantic.BaseModel):
     sql_url: typing.Optional[str]
     model_endpoint_monitoring_store_type: typing.Optional[str]
     model_endpoint_monitoring_endpoint_store_connection: typing.Optional[str]
+    model_monitoring_tsdb_connection: typing.Optional[str]
     ce: typing.Optional[dict]
     # not passing them as one object as it possible client user would like to override only one of the params
     calculate_artifact_hash: typing.Optional[str]
mlrun/common/schemas/function.py CHANGED
@@ -45,6 +45,9 @@ class FunctionState:
     # same goes for the build which is not coming from the pod, but is used and we can't just omit it for BC reasons
     build = "build"
 
+    # for pipeline steps
+    skipped = "skipped"
+
     @classmethod
     def get_function_state_from_pod_state(cls, pod_state: str):
         if pod_state == "succeeded":
@@ -60,6 +63,7 @@ class FunctionState:
         return [
             cls.ready,
             cls.error,
+            cls.skipped,
         ]
 
 
mlrun/common/schemas/model_monitoring/__init__.py CHANGED
@@ -30,20 +30,22 @@ from .constants import (
     ModelMonitoringMode,
     ModelMonitoringStoreKinds,
     MonitoringFunctionNames,
-    MonitoringTSDBTables,
     ProjectSecretKeys,
     PrometheusEndpoints,
     PrometheusMetric,
     ResultData,
     SchedulingKeys,
+    TDEngineSuperTables,
     TimeSeriesConnector,
     TSDBTarget,
+    V3IOTSDBTables,
     VersionedModel,
     WriterEvent,
     WriterEventKind,
 )
 from .grafana import (
     GrafanaColumn,
+    GrafanaColumnType,
     GrafanaDataPoint,
     GrafanaNumberColumn,
     GrafanaStringColumn,
mlrun/common/schemas/model_monitoring/constants.py CHANGED
@@ -81,6 +81,8 @@ class EventFieldType:
     DRIFT_DETECTED_THRESHOLD = "drift_detected_threshold"
     POSSIBLE_DRIFT_THRESHOLD = "possible_drift_threshold"
     SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    TIME = "time"
+    TABLE_COLUMN = "table_column"
 
 
 class FeatureSetFeatures(MonitoringStrEnum):
@@ -171,6 +173,7 @@ class ProjectSecretKeys:
     PIPELINES_ACCESS_KEY = "MODEL_MONITORING_PIPELINES_ACCESS_KEY"
     KAFKA_BROKERS = "KAFKA_BROKERS"
     STREAM_PATH = "STREAM_PATH"
+    TSDB_CONNECTION = "TSDB_CONNECTION"
 
 
 class ModelMonitoringStoreKinds:
@@ -230,12 +233,18 @@ class MonitoringFunctionNames(MonitoringStrEnum):
     WRITER = "model-monitoring-writer"
 
 
-class MonitoringTSDBTables(MonitoringStrEnum):
+class V3IOTSDBTables(MonitoringStrEnum):
     APP_RESULTS = "app-results"
     METRICS = "metrics"
     EVENTS = "events"
 
 
+class TDEngineSuperTables(MonitoringStrEnum):
+    APP_RESULTS = "app_results"
+    METRICS = "metrics"
+    PREDICTIONS = "predictions"
+
+
 @dataclass
 class FunctionURI:
     project: str
@@ -339,6 +348,7 @@ class ControllerPolicy:
 
 class TSDBTarget:
     V3IO_TSDB = "v3io-tsdb"
+    TDEngine = "tdengine"
     PROMETHEUS = "prometheus"
     APP_RESULTS_TABLE = "app-results"
     V3IO_BE = "tsdb"
@@ -348,3 +358,12 @@ class TSDBTarget:
 class HistogramDataDriftApplicationConstants:
     NAME = "histogram-data-drift"
     GENERAL_RESULT_NAME = "general_drift"
+
+
+class PredictionsQueryConstants:
+    DEFAULT_AGGREGATION_GRANULARITY = "10m"
+    INVOCATIONS = "invocations"
+
+
+class SpecialApps:
+    MLRUN_INFRA = "mlrun-infra"
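Note: the TSDB table names are now split per backend: the V3IO tables keep hyphenated names, while the new TDEngine super-tables use underscores. A quick comparison, assuming MonitoringStrEnum iterates like a standard Enum:

    from mlrun.common.schemas.model_monitoring.constants import (
        TDEngineSuperTables,
        V3IOTSDBTables,
    )

    print([table.value for table in V3IOTSDBTables])       # ['app-results', 'metrics', 'events']
    print([table.value for table in TDEngineSuperTables])  # ['app_results', 'metrics', 'predictions']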
mlrun/common/schemas/model_monitoring/grafana.py CHANGED
@@ -11,12 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
 from typing import Optional, Union
 
 from pydantic import BaseModel
 
+import mlrun.common.types
+
+
+class GrafanaColumnType(mlrun.common.types.StrEnum):
+    NUMBER = "number"
+    STRING = "string"
+
 
 class GrafanaColumn(BaseModel):
     text: str
@@ -24,13 +30,11 @@ class GrafanaColumn(BaseModel):
 
 
 class GrafanaNumberColumn(GrafanaColumn):
-    text: str
-    type: str = "number"
+    type: str = GrafanaColumnType.NUMBER
 
 
 class GrafanaStringColumn(GrafanaColumn):
-    text: str
-    type: str = "string"
+    type: str = GrafanaColumnType.STRING
 
 
 class GrafanaTable(BaseModel):
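Note: because GrafanaColumnType derives from StrEnum, the refactored defaults still compare and serialize as the plain strings Grafana expects. A minimal check against the models above:

    from mlrun.common.schemas.model_monitoring.grafana import (
        GrafanaNumberColumn,
        GrafanaStringColumn,
    )

    assert GrafanaNumberColumn(text="drift_value").type == "number"
    assert GrafanaStringColumn(text="endpoint_id").type == "string"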
mlrun/common/schemas/model_monitoring/model_endpoints.py CHANGED
@@ -298,6 +298,7 @@ class ModelEndpointList(BaseModel):
 
 class ModelEndpointMonitoringMetricType(mlrun.common.types.StrEnum):
     RESULT = "result"
+    METRIC = "metric"
 
 
 class ModelEndpointMonitoringMetric(BaseModel):
@@ -322,7 +323,7 @@ _FQN_PART_PATTERN = r"[a-zA-Z0-9_-]+"
 _FQN_PATTERN = (
     rf"^(?P<project>{_FQN_PART_PATTERN})\."
     rf"(?P<app>{_FQN_PART_PATTERN})\."
-    rf"(?P<type>{_FQN_PART_PATTERN})\."
+    rf"(?P<type>{ModelEndpointMonitoringMetricType.RESULT}|{ModelEndpointMonitoringMetricType.METRIC})\."
     rf"(?P<name>{_FQN_PART_PATTERN})$"
 )
 _FQN_REGEX = re.compile(_FQN_PATTERN)
@@ -337,27 +338,37 @@ def _parse_metric_fqn_to_monitoring_metric(fqn: str) -> ModelEndpointMonitoringM
     )
 
 
+class _MetricPoint(NamedTuple):
+    timestamp: datetime
+    value: float
+
+
 class _ResultPoint(NamedTuple):
     timestamp: datetime
     value: float
     status: ResultStatusApp
 
 
-class _ModelEndpointMonitoringResultValuesBase(BaseModel):
+class _ModelEndpointMonitoringMetricValuesBase(BaseModel):
     full_name: str
     type: ModelEndpointMonitoringMetricType
     data: bool
 
 
-class ModelEndpointMonitoringResultValues(_ModelEndpointMonitoringResultValuesBase):
-    full_name: str
-    type: ModelEndpointMonitoringMetricType
+class ModelEndpointMonitoringMetricValues(_ModelEndpointMonitoringMetricValuesBase):
+    type: ModelEndpointMonitoringMetricType = ModelEndpointMonitoringMetricType.METRIC
+    values: list[_MetricPoint]
+    data: bool = True
+
+
+class ModelEndpointMonitoringResultValues(_ModelEndpointMonitoringMetricValuesBase):
+    type: ModelEndpointMonitoringMetricType = ModelEndpointMonitoringMetricType.RESULT
     result_kind: ResultKindApp
     values: list[_ResultPoint]
     data: bool = True
 
 
-class ModelEndpointMonitoringResultNoData(_ModelEndpointMonitoringResultValuesBase):
+class ModelEndpointMonitoringMetricNoData(_ModelEndpointMonitoringMetricValuesBase):
     full_name: str
     type: ModelEndpointMonitoringMetricType
     data: bool = False
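Note: the type segment of a metric FQN is now restricted to the two literal values instead of the generic name pattern. A standalone illustration of the same regex, with the enum values inlined:

    import re

    _FQN_PART_PATTERN = r"[a-zA-Z0-9_-]+"
    _FQN_PATTERN = (
        rf"^(?P<project>{_FQN_PART_PATTERN})\."
        rf"(?P<app>{_FQN_PART_PATTERN})\."
        rf"(?P<type>result|metric)\."
        rf"(?P<name>{_FQN_PART_PATTERN})$"
    )

    assert re.match(_FQN_PATTERN, "my-project.my-app.result.general_drift")
    assert re.match(_FQN_PATTERN, "my-project.my-app.metric.latency")
    assert not re.match(_FQN_PATTERN, "my-project.my-app.other.latency")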
mlrun/config.py CHANGED
@@ -521,7 +521,9 @@ default_config = {
         # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
         "store_type": "v3io-nosql",
         "endpoint_store_connection": "",
+        # See mlrun.model_monitoring.db.tsdb.ObjectTSDBFactory for available options
         "tsdb_connector_type": "v3io-tsdb",
+        "tsdb_connection": "",
     },
     "secret_stores": {
         # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory
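Note: the new tsdb_connection key is read through mlrun.mlconf like any other config value. A hedged sketch, assuming these keys sit under the model_endpoint_monitoring section as in earlier releases (the connection string is a placeholder, not a documented format):

    import mlrun

    # select the TDEngine connector instead of the default v3io-tsdb ...
    mlrun.mlconf.model_endpoint_monitoring.tsdb_connector_type = "tdengine"
    # ... and point it at the time-series database (placeholder value)
    mlrun.mlconf.model_endpoint_monitoring.tsdb_connection = "taosws://user:password@tdengine-host:6041"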
mlrun/data_types/to_pandas.py CHANGED
@@ -154,10 +154,10 @@ def toPandas(spark_df):
     column_counter = Counter(spark_df.columns)
 
     dtype = [None] * len(spark_df.schema)
-    for fieldIdx, field in enumerate(spark_df.schema):
+    for field_idx, field in enumerate(spark_df.schema):
         # For duplicate column name, we use `iloc` to access it.
         if column_counter[field.name] > 1:
-            pandas_col = pdf.iloc[:, fieldIdx]
+            pandas_col = pdf.iloc[:, field_idx]
         else:
             pandas_col = pdf[field.name]
 
@@ -171,12 +171,12 @@ def toPandas(spark_df):
             and field.nullable
             and pandas_col.isnull().any()
         ):
-            dtype[fieldIdx] = pandas_type
+            dtype[field_idx] = pandas_type
         # Ensure we fall back to nullable numpy types, even when whole column is null:
         if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = np.float64
+            dtype[field_idx] = np.float64
         if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any():
-            dtype[fieldIdx] = object
+            dtype[field_idx] = object
 
     df = pd.DataFrame()
     for index, t in enumerate(dtype):
mlrun/datastore/datastore.py CHANGED
@@ -223,6 +223,11 @@ class StoreManager:
             subpath = url[len("memory://") :]
             return in_memory_store, subpath, url
 
+        elif schema in get_local_file_schema():
+            # parse_url() will drop the windows drive-letter from the path for url like "c:\a\b".
+            # As a workaround, we set subpath to the url.
+            subpath = url.replace("file://", "", 1)
+
         if not schema and endpoint:
             if endpoint in self._stores.keys():
                 return self._stores[endpoint], subpath, url
@@ -241,8 +246,7 @@ class StoreManager:
         )
         if not secrets and not mlrun.config.is_running_as_api():
             self._stores[store_key] = store
-        # in file stores in windows path like c:\a\b the drive letter is dropped from the path, so we return the url
-        return store, url if store.kind == "file" else subpath, url
+        return store, subpath, url
 
     def reset_secrets(self):
         self._secrets = {}
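Note: the workaround keeps the full local path, including a Windows drive letter, in the returned subpath instead of the value parse_url() would produce. The string handling is simple enough to show standalone:

    url = r"file://c:\a\b"
    # strip only the scheme prefix; everything after it, drive letter included, stays intact
    subpath = url.replace("file://", "", 1)
    assert subpath == r"c:\a\b"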
mlrun/datastore/redis.py CHANGED
@@ -31,7 +31,7 @@ class RedisStore(DataStore):
     """
 
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
-        REDIS_DEFAULT_PORT = "6379"
+        redis_default_port = "6379"
         super().__init__(parent, name, schema, endpoint, secrets=secrets)
         self.headers = None
 
@@ -49,7 +49,7 @@ class RedisStore(DataStore):
         user = self._get_secret_or_env("REDIS_USER", "", credentials_prefix)
         password = self._get_secret_or_env("REDIS_PASSWORD", "", credentials_prefix)
         host = parsed_endpoint.hostname
-        port = parsed_endpoint.port if parsed_endpoint.port else REDIS_DEFAULT_PORT
+        port = parsed_endpoint.port if parsed_endpoint.port else redis_default_port
         schema = parsed_endpoint.scheme
         if user or password:
             endpoint = f"{schema}://{user}:{password}@{host}:{port}"
mlrun/datastore/s3.py CHANGED
@@ -198,6 +198,11 @@ class S3Store(DataStore):
         bucket = self.s3.Bucket(bucket)
         return [obj.key[key_length:] for obj in bucket.objects.filter(Prefix=key)]
 
+    def rm(self, path, recursive=False, maxdepth=None):
+        bucket, key = self.get_bucket_and_key(path)
+        path = f"{bucket}/{key}"
+        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
+
 
 def parse_s3_bucket_and_key(s3_path):
     try:
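Note: the new rm() rebuilds the "<bucket>/<key>" path and delegates to the underlying fsspec filesystem. A hedged usage sketch; the store is resolved through mlrun.store_manager as in the sources.py hunks below, and the URL is a placeholder:

    import mlrun

    store, subpath, url = mlrun.store_manager.get_or_create_store("s3://my-bucket/tmp/exports/")
    # remove the whole prefix; maps to the filesystem's rm(recursive=True) under the hood
    store.rm(subpath, recursive=True)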
mlrun/datastore/sources.py CHANGED
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import json
+import math
+import operator
 import os
 import warnings
 from base64 import b64encode
@@ -178,7 +180,7 @@ class CSVSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         schedule: str = None,
         parse_dates: Union[None, int, str, list[int], list[str]] = None,
@@ -305,7 +307,7 @@ class ParquetSource(BaseSourceDriver):
         self,
         name: str = "",
         path: str = None,
-        attributes: dict[str, str] = None,
+        attributes: dict[str, object] = None,
         key_field: str = None,
         time_field: str = None,
         schedule: str = None,
@@ -313,6 +315,10 @@ class ParquetSource(BaseSourceDriver):
         end_time: Optional[Union[datetime, str]] = None,
         additional_filters: Optional[list[tuple]] = None,
     ):
+        if additional_filters:
+            attributes = copy(attributes) or {}
+            attributes["additional_filters"] = additional_filters
+            self.validate_additional_filters(additional_filters)
         super().__init__(
             name,
             path,
@@ -323,7 +329,6 @@ class ParquetSource(BaseSourceDriver):
             start_time,
             end_time,
         )
-        self.additional_filters = additional_filters
 
     @property
     def start_time(self):
@@ -341,6 +346,10 @@ class ParquetSource(BaseSourceDriver):
     def end_time(self, end_time):
         self._end_time = self._convert_to_datetime(end_time)
 
+    @property
+    def additional_filters(self):
+        return self.attributes.get("additional_filters")
+
     @staticmethod
     def _convert_to_datetime(time):
         if time and isinstance(time, str):
@@ -350,6 +359,25 @@ class ParquetSource(BaseSourceDriver):
         else:
             return time
 
+    @staticmethod
+    def validate_additional_filters(additional_filters):
+        if not additional_filters:
+            return
+        for filter_tuple in additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if isinstance(value, float) and math.isnan(value):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "using NaN in additional_filters is not supported"
+                )
+            elif isinstance(value, (list, tuple, set)):
+                for sub_value in value:
+                    if isinstance(sub_value, float) and math.isnan(sub_value):
+                        raise mlrun.errors.MLRunInvalidArgumentError(
+                            "using NaN in additional_filters is not supported"
+                        )
+
     def to_step(
         self,
         key_field=None,
@@ -361,13 +389,12 @@ class ParquetSource(BaseSourceDriver):
     ):
         import storey
 
-        attributes = self.attributes or {}
+        attributes = copy(self.attributes)
+        attributes.pop("additional_filters", None)
         if context:
             attributes["context"] = context
-
         data_item = mlrun.store_manager.object(self.path)
         store, path, url = mlrun.store_manager.get_or_create_store(self.path)
-
         return storey.ParquetSource(
             paths=url,  # unlike self.path, it already has store:// replaced
             key_field=self.key_field or key_field,
@@ -412,6 +439,84 @@ class ParquetSource(BaseSourceDriver):
             **reader_args,
         )
 
+    def _build_spark_additional_filters(self, column_types: dict):
+        if not self.additional_filters:
+            return None
+        from pyspark.sql.functions import col, isnan, lit
+
+        operators = {
+            "==": operator.eq,
+            "=": operator.eq,
+            ">": operator.gt,
+            "<": operator.lt,
+            ">=": operator.ge,
+            "<=": operator.le,
+            "!=": operator.ne,
+        }
+
+        spark_filter = None
+        new_filter = lit(True)
+        for filter_tuple in self.additional_filters:
+            if not filter_tuple:
+                continue
+            col_name, op, value = filter_tuple
+            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+                none_exists = False
+                value = list(value)
+                for sub_value in value:
+                    if sub_value is None:
+                        value.remove(sub_value)
+                        none_exists = True
+                if none_exists:
+                    filter_nan = column_types[col_name] not in ("timestamp", "date")
+                    if value:
+                        if op.lower() == "in":
+                            new_filter = (
+                                col(col_name).isin(value) | col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+
+                        else:
+                            new_filter = (
+                                ~col(col_name).isin(value) & ~col(col_name).isNull()
+                            )
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                    else:
+                        if op.lower() == "in":
+                            new_filter = col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter | isnan(col(col_name))
+                        else:
+                            new_filter = ~col(col_name).isNull()
+                            if filter_nan:
+                                new_filter = new_filter & ~isnan(col(col_name))
+                else:
+                    if op.lower() == "in":
+                        new_filter = col(col_name).isin(value)
+                    elif op.lower() == "not in":
+                        new_filter = ~col(col_name).isin(value)
+            elif op in operators:
+                new_filter = operators[op](col(col_name), value)
+            else:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    f"unsupported filter operator: {op}"
+                )
+            if spark_filter is not None:
+                spark_filter = spark_filter & new_filter
+            else:
+                spark_filter = new_filter
+        return spark_filter
+
+    def _filter_spark_df(self, df, time_field=None, columns=None):
+        spark_additional_filters = self._build_spark_additional_filters(
+            column_types=dict(df.dtypes)
+        )
+        if spark_additional_filters is not None:
+            df = df.filter(spark_additional_filters)
+        return super()._filter_spark_df(df=df, time_field=time_field, columns=columns)
+
 
 class BigQuerySource(BaseSourceDriver):
     """
mlrun/datastore/targets.py CHANGED
@@ -2134,7 +2134,7 @@ class SQLTarget(BaseStoreTarget):
             raise ValueError(f"Table named {table_name} is not exist")
 
         elif not table_exists and create_table:
-            TYPE_TO_SQL_TYPE = {
+            type_to_sql_type = {
                 int: sqlalchemy.Integer,
                 str: sqlalchemy.String(self.attributes.get("varchar_len")),
                 datetime.datetime: sqlalchemy.dialects.mysql.DATETIME(fsp=6),
@@ -2147,7 +2147,7 @@ class SQLTarget(BaseStoreTarget):
             # creat new table with the given name
             columns = []
             for col, col_type in self.schema.items():
-                col_type_sql = TYPE_TO_SQL_TYPE.get(col_type)
+                col_type_sql = type_to_sql_type.get(col_type)
                 if col_type_sql is None:
                     raise TypeError(
                         f"'{col_type}' unsupported type for column '{col}'"
mlrun/db/base.py CHANGED
@@ -17,6 +17,7 @@ from abc import ABC, abstractmethod
 from typing import Optional, Union
 
 import mlrun.alerts
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.model_monitoring
 
@@ -63,7 +64,10 @@ class RunDBInterface(ABC):
         uid: Optional[Union[str, list[str]]] = None,
         project: Optional[str] = None,
         labels: Optional[Union[str, list[str]]] = None,
-        state: Optional[str] = None,
+        state: Optional[
+            mlrun.common.runtimes.constants.RunStates
+        ] = None,  # Backward compatibility
+        states: Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
         last: int = 0,
         iter: bool = False,