mlrun 1.7.0rc14__py3-none-any.whl → 1.7.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +10 -1
- mlrun/__main__.py +18 -109
- mlrun/{runtimes/mpijob/v1alpha1.py → alerts/__init__.py} +2 -16
- mlrun/alerts/alert.py +141 -0
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +36 -253
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +20 -41
- mlrun/artifacts/model.py +8 -140
- mlrun/artifacts/plots.py +14 -375
- mlrun/common/schemas/__init__.py +4 -2
- mlrun/common/schemas/alert.py +46 -4
- mlrun/common/schemas/api_gateway.py +4 -0
- mlrun/common/schemas/artifact.py +15 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/model_monitoring/__init__.py +8 -1
- mlrun/common/schemas/model_monitoring/constants.py +40 -4
- mlrun/common/schemas/model_monitoring/model_endpoints.py +73 -2
- mlrun/common/schemas/project.py +2 -0
- mlrun/config.py +7 -4
- mlrun/data_types/to_pandas.py +4 -4
- mlrun/datastore/base.py +41 -9
- mlrun/datastore/datastore_profile.py +54 -4
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/sources.py +43 -2
- mlrun/datastore/store_resources.py +2 -6
- mlrun/datastore/targets.py +106 -39
- mlrun/db/base.py +23 -3
- mlrun/db/httpdb.py +101 -47
- mlrun/db/nopdb.py +20 -2
- mlrun/errors.py +5 -0
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +12 -47
- mlrun/feature_store/feature_set.py +9 -0
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +4 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +2 -0
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +5 -0
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +5 -10
- mlrun/launcher/base.py +4 -3
- mlrun/launcher/client.py +1 -1
- mlrun/lists.py +4 -2
- mlrun/model.py +25 -11
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +41 -18
- mlrun/model_monitoring/application.py +5 -305
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +132 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +3 -1
- mlrun/model_monitoring/db/__init__.py +2 -0
- mlrun/model_monitoring/db/stores/base/store.py +9 -36
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +7 -6
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +63 -110
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +104 -187
- mlrun/model_monitoring/db/tsdb/__init__.py +71 -0
- mlrun/model_monitoring/db/tsdb/base.py +135 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +404 -0
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +134 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/helpers.py +1 -1
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +48 -213
- mlrun/model_monitoring/writer.py +101 -121
- mlrun/platforms/__init__.py +10 -9
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +11 -7
- mlrun/projects/pipelines.py +13 -76
- mlrun/projects/project.py +73 -45
- mlrun/render.py +11 -13
- mlrun/run.py +6 -41
- mlrun/runtimes/__init__.py +3 -3
- mlrun/runtimes/base.py +6 -6
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +2 -1
- mlrun/runtimes/local.py +1 -1
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/api_gateway.py +75 -9
- mlrun/runtimes/nuclio/function.py +9 -35
- mlrun/runtimes/pod.py +16 -36
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +1 -39
- mlrun/utils/helpers.py +72 -71
- mlrun/utils/notifications/notification/base.py +1 -1
- mlrun/utils/notifications/notification/slack.py +12 -5
- mlrun/utils/notifications/notification/webhook.py +1 -1
- mlrun/utils/notifications/notification_pusher.py +134 -14
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/METADATA +4 -3
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/RECORD +105 -95
- mlrun/kfpops.py +0 -865
- mlrun/platforms/other.py +0 -305
- /mlrun/{runtimes → common/runtimes}/constants.py +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc14.dist-info → mlrun-1.7.0rc16.dist-info}/top_level.txt +0 -0
mlrun/common/schemas/alert.py
CHANGED
@@ -26,10 +26,10 @@ class EventEntityKind(StrEnum):
     JOB = "job"
 
 
-class …
+class EventEntities(pydantic.BaseModel):
     kind: EventEntityKind
     project: str
-    …
+    ids: pydantic.conlist(str, min_items=1, max_items=1)
 
 
 class EventKind(StrEnum):
@@ -48,7 +48,7 @@ _event_kind_entity_map = {
 class Event(pydantic.BaseModel):
     kind: EventKind
     timestamp: Union[str, datetime] = None  # occurrence time
-    entity: …
+    entity: EventEntities
     value_dict: Optional[dict] = pydantic.Field(default_factory=dict)
 
     def is_valid(self):
@@ -71,6 +71,12 @@ class AlertTrigger(pydantic.BaseModel):
     events: list[EventKind] = []
     prometheus_alert: str = None
 
+    def __eq__(self, other):
+        return (
+            self.prometheus_alert == other.prometheus_alert
+            and self.events == other.events
+        )
+
 
 class AlertCriteria(pydantic.BaseModel):
     count: Annotated[
@@ -86,6 +92,9 @@ class AlertCriteria(pydantic.BaseModel):
         ),
     ] = None
 
+    def __eq__(self, other):
+        return self.count == other.count and self.period == other.period
+
 
 class ResetPolicy(StrEnum):
     MANUAL = "manual"
@@ -108,7 +117,7 @@ class AlertConfig(pydantic.BaseModel):
     ]
     created: Union[str, datetime] = None
     severity: AlertSeverity
-    …
+    entities: EventEntities
     trigger: AlertTrigger
     criteria: Optional[AlertCriteria]
     reset_policy: ResetPolicy = ResetPolicy.MANUAL
@@ -120,3 +129,36 @@
 class AlertsModes(StrEnum):
     enabled = "enabled"
     disabled = "disabled"
+
+
+class AlertTemplate(
+    pydantic.BaseModel
+):  # Template fields that are not shared with created configs
+    template_id: int = None
+    template_name: str
+    template_description: Optional[str] = (
+        "String explaining the purpose of this template"
+    )
+
+    # A property that identifies templates that were created by the system and cannot be modified/deleted by the user
+    system_generated: bool = False
+
+    # AlertConfig fields that are pre-defined
+    description: Optional[str] = (
+        "String to be sent in the generated notifications e.g. 'Model {{ $project }}/{{ $entity }} is drifting.'"
+    )
+    severity: AlertSeverity
+    trigger: AlertTrigger
+    criteria: Optional[AlertCriteria]
+    reset_policy: ResetPolicy = ResetPolicy.MANUAL
+
+    # This is slightly different than __eq__ as it doesn't compare everything
+    def templates_differ(self, other):
+        return (
+            self.template_description != other.template_description
+            or self.description != other.description
+            or self.severity != other.severity
+            or self.trigger != other.trigger
+            or self.reset_policy != other.reset_policy
+            or self.criteria != other.criteria
+        )
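For illustration, a minimal sketch of how the reworked alert schemas might be exercised. The class and field names and EventEntityKind.JOB come from this diff; the project name and id value are illustrative, and this is not code from the package itself.

from mlrun.common.schemas.alert import AlertTrigger, EventEntities, EventEntityKind

# `ids` is now a pydantic.conlist(str, min_items=1, max_items=1): exactly one id is expected
entities = EventEntities(kind=EventEntityKind.JOB, project="my-project", ids=["train"])

# AlertTrigger gained a field-wise __eq__, so independently built triggers with
# identical values now compare equal
assert AlertTrigger(events=[]) == AlertTrigger(events=[])
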
mlrun/common/schemas/api_gateway.py
CHANGED

@@ -23,6 +23,7 @@ import mlrun.common.types
 class APIGatewayAuthenticationMode(mlrun.common.types.StrEnum):
     basic = "basicAuth"
     none = "none"
+    access_key = "accessKey"
 
     @classmethod
     def from_str(cls, authentication_mode: str):
@@ -30,6 +31,8 @@ class APIGatewayAuthenticationMode(mlrun.common.types.StrEnum):
             return cls.none
         elif authentication_mode == "basicAuth":
             return cls.basic
+        elif authentication_mode == "accessKey":
+            return cls.access_key
         else:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 f"Authentication mode `{authentication_mode}` is not supported",
@@ -63,6 +66,7 @@ class APIGatewayUpstream(_APIGatewayBaseModel):
     kind: Optional[str] = "nucliofunction"
     nucliofunction: dict[str, str]
     percentage: Optional[int] = 0
+    port: Optional[int] = 0
 
 
 class APIGatewaySpec(_APIGatewayBaseModel):
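A short sketch of resolving the new access-key authentication mode; the enum values and from_str behavior are taken directly from the hunks above.

from mlrun.common.schemas.api_gateway import APIGatewayAuthenticationMode

mode = APIGatewayAuthenticationMode.from_str("accessKey")
assert mode is APIGatewayAuthenticationMode.access_key
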
mlrun/common/schemas/artifact.py
CHANGED
@@ -93,3 +93,18 @@ class Artifact(pydantic.BaseModel):
     metadata: ArtifactMetadata
     spec: ArtifactSpec
     status: ObjectStatus
+
+
+class ArtifactsDeletionStrategies(mlrun.common.types.StrEnum):
+    """Artifacts deletion strategies types."""
+
+    metadata_only = "metadata-only"
+    """Only removes the artifact db record, leaving all related artifact data in-place"""
+
+    data_optional = "data-optional"
+    """Delete the artifact data of the artifact as a best-effort.
+    If artifact data deletion fails still try to delete the artifact db record"""
+
+    data_force = "data-force"
+    """Delete the artifact data, and if cannot delete it fail the deletion
+    and don't delete the artifact db record"""
mlrun/common/schemas/auth.py
CHANGED
@@ -59,6 +59,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
     hub_source = "hub-source"
     workflow = "workflow"
     alert = "alert"
+    alert_templates = "alert-templates"
     event = "event"
     datastore_profile = "datastore-profile"
     api_gateway = "api-gateway"
@@ -87,6 +88,7 @@ class AuthorizationResourceTypes(mlrun.common.types.StrEnum):
             AuthorizationResourceTypes.run: "/projects/{project_name}/runs/{resource_name}",
             AuthorizationResourceTypes.event: "/projects/{project_name}/events/{resource_name}",
             AuthorizationResourceTypes.alert: "/projects/{project_name}/alerts/{resource_name}",
+            AuthorizationResourceTypes.alert_templates: "/alert-templates/{resource_name}",
             # runtime resource doesn't have an identifier, we don't need any auth granularity behind project level
             AuthorizationResourceTypes.runtime_resource: "/projects/{project_name}/runtime-resources",
             AuthorizationResourceTypes.model_endpoint: "/projects/{project_name}/model-endpoints/{resource_name}",
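Unlike alerts, the new alert-templates resource is global rather than project scoped: its authorization path template (copied from the hunk above) has no {project_name} segment. The resource name below is illustrative.

alert_path = "/projects/{project_name}/alerts/{resource_name}"
template_path = "/alert-templates/{resource_name}"
print(template_path.format(resource_name="drift-detected"))  # -> /alert-templates/drift-detected
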
mlrun/common/schemas/model_monitoring/__init__.py
CHANGED

@@ -25,17 +25,22 @@ from .constants import (
     FeatureSetFeatures,
     FileTargetKind,
     FunctionURI,
+    MetricData,
     ModelEndpointTarget,
     ModelMonitoringMode,
     ModelMonitoringStoreKinds,
     MonitoringFunctionNames,
+    MonitoringTSDBTables,
     ProjectSecretKeys,
     PrometheusEndpoints,
     PrometheusMetric,
+    ResultData,
     SchedulingKeys,
-    …
+    TimeSeriesConnector,
+    TSDBTarget,
     VersionedModel,
     WriterEvent,
+    WriterEventKind,
 )
 from .grafana import (
     GrafanaColumn,
@@ -51,6 +56,8 @@ from .model_endpoints import (
     ModelEndpoint,
     ModelEndpointList,
     ModelEndpointMetadata,
+    ModelEndpointMonitoringMetric,
+    ModelEndpointMonitoringMetricType,
     ModelEndpointSpec,
     ModelEndpointStatus,
 )
mlrun/common/schemas/model_monitoring/constants.py
CHANGED

@@ -99,14 +99,17 @@ class FeatureSetFeatures(MonitoringStrEnum):
 
 class ApplicationEvent:
     APPLICATION_NAME = "application_name"
-    CURRENT_STATS = "current_stats"
-    FEATURE_STATS = "feature_stats"
-    SAMPLE_PARQUET_PATH = "sample_parquet_path"
     START_INFER_TIME = "start_infer_time"
     END_INFER_TIME = "end_infer_time"
     LAST_REQUEST = "last_request"
     ENDPOINT_ID = "endpoint_id"
     OUTPUT_STREAM_URI = "output_stream_uri"
+    MLRUN_CONTEXT = "mlrun_context"
+
+    # Deprecated fields - TODO : delete in 1.9.0 (V1 app deprecation)
+    SAMPLE_PARQUET_PATH = "sample_parquet_path"
+    CURRENT_STATS = "current_stats"
+    FEATURE_STATS = "feature_stats"
 
 
 class WriterEvent(MonitoringStrEnum):
@@ -114,6 +117,21 @@ class WriterEvent(MonitoringStrEnum):
     ENDPOINT_ID = "endpoint_id"
     START_INFER_TIME = "start_infer_time"
     END_INFER_TIME = "end_infer_time"
+    EVENT_KIND = "event_kind"  # metric or result
+    DATA = "data"
+
+
+class WriterEventKind(MonitoringStrEnum):
+    METRIC = "metric"
+    RESULT = "result"
+
+
+class MetricData(MonitoringStrEnum):
+    METRIC_NAME = "metric_name"
+    METRIC_VALUE = "metric_value"
+
+
+class ResultData(MonitoringStrEnum):
     RESULT_NAME = "result_name"
     RESULT_VALUE = "result_value"
     RESULT_KIND = "result_kind"
@@ -138,7 +156,7 @@ class EventKeyMetrics:
     REAL_TIME = "real_time"
 
 
-class …
+class TimeSeriesConnector:
     TSDB = "tsdb"
 
 
@@ -176,6 +194,7 @@ class FileTargetKind:
     LOG_STREAM = "log_stream"
     APP_RESULTS = "app_results"
     MONITORING_SCHEDULES = "monitoring_schedules"
+    MONITORING_APPLICATION = "monitoring_application"
 
 
 class ModelMonitoringMode(str, Enum):
@@ -210,6 +229,12 @@ class MonitoringFunctionNames(MonitoringStrEnum):
     WRITER = "model-monitoring-writer"
 
 
+class MonitoringTSDBTables(MonitoringStrEnum):
+    APP_RESULTS = "app-results"
+    METRICS = "metrics"
+    EVENTS = "events"
+
+
 @dataclass
 class FunctionURI:
     project: str
@@ -303,11 +328,22 @@ class ModelMonitoringAppLabel:
     KEY = "mlrun__type"
     VAL = "mlrun__model-monitoring-application"
 
+    def __str__(self) -> str:
+        return f"{self.KEY}={self.VAL}"
+
 
 class ControllerPolicy:
     BASE_PERIOD = "base_period"
 
 
+class TSDBTarget:
+    V3IO_TSDB = "v3io-tsdb"
+    PROMETHEUS = "prometheus"
+    APP_RESULTS_TABLE = "app-results"
+    V3IO_BE = "tsdb"
+    V3IO_RATE = "1/s"
+
+
 class HistogramDataDriftApplicationConstants:
     NAME = "histogram-data-drift"
     GENERAL_RESULT_NAME = "general_drift"
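Illustrative sketch of how a writer event might be shaped with the new EVENT_KIND/DATA fields and the MetricData enum. The enum members are from the hunks above; the payload values and the way the monitoring writer actually consumes them are assumptions, not shown in this diff.

from mlrun.common.schemas.model_monitoring.constants import (
    MetricData,
    WriterEvent,
    WriterEventKind,
)

event = {
    WriterEvent.ENDPOINT_ID: "ep-1234",  # illustrative endpoint id
    WriterEvent.EVENT_KIND: WriterEventKind.METRIC,
    WriterEvent.DATA: {
        MetricData.METRIC_NAME: "latency_avg",  # illustrative metric name
        MetricData.METRIC_VALUE: 12.5,
    },
}
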
mlrun/common/schemas/model_monitoring/model_endpoints.py
CHANGED

@@ -11,16 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
 import enum
 import json
-…
+import re
+from datetime import datetime
+from typing import Any, NamedTuple, Optional
 
 from pydantic import BaseModel, Field, validator
 from pydantic.main import Extra
 
 import mlrun.common.model_monitoring
+import mlrun.common.types
 
 from ..object import ObjectKind, ObjectSpec, ObjectStatus
 from .constants import (
@@ -29,6 +31,8 @@ from .constants import (
     EventKeyMetrics,
     EventLiveStats,
     ModelMonitoringMode,
+    ResultKindApp,
+    ResultStatusApp,
 )
 
 
@@ -292,6 +296,73 @@ class ModelEndpointList(BaseModel):
     endpoints: list[ModelEndpoint] = []
 
 
+class ModelEndpointMonitoringMetricType(mlrun.common.types.StrEnum):
+    RESULT = "result"
+
+
+class ModelEndpointMonitoringMetric(BaseModel):
+    project: str
+    app: str
+    type: ModelEndpointMonitoringMetricType
+    name: str
+    full_name: str
+
+
+def _compose_full_name(
+    *,
+    project: str,
+    app: str,
+    name: str,
+    type: ModelEndpointMonitoringMetricType = ModelEndpointMonitoringMetricType.RESULT,
+) -> str:
+    return ".".join([project, app, type, name])
+
+
+_FQN_PART_PATTERN = r"[a-zA-Z0-9_-]+"
+_FQN_PATTERN = (
+    rf"^(?P<project>{_FQN_PART_PATTERN})\."
+    rf"(?P<app>{_FQN_PART_PATTERN})\."
+    rf"(?P<type>{_FQN_PART_PATTERN})\."
+    rf"(?P<name>{_FQN_PART_PATTERN})$"
+)
+_FQN_REGEX = re.compile(_FQN_PATTERN)
+
+
+def _parse_metric_fqn_to_monitoring_metric(fqn: str) -> ModelEndpointMonitoringMetric:
+    match = _FQN_REGEX.fullmatch(fqn)
+    if match is None:
+        raise ValueError("The fully qualified name is not in the expected format")
+    return ModelEndpointMonitoringMetric.parse_obj(
+        match.groupdict() | {"full_name": fqn}
+    )
+
+
+class _ResultPoint(NamedTuple):
+    timestamp: datetime
+    value: float
+    status: ResultStatusApp
+
+
+class _ModelEndpointMonitoringResultValuesBase(BaseModel):
+    full_name: str
+    type: ModelEndpointMonitoringMetricType
+    data: bool
+
+
+class ModelEndpointMonitoringResultValues(_ModelEndpointMonitoringResultValuesBase):
+    full_name: str
+    type: ModelEndpointMonitoringMetricType
+    result_kind: ResultKindApp
+    values: list[_ResultPoint]
+    data: bool = True
+
+
+class ModelEndpointMonitoringResultNoData(_ModelEndpointMonitoringResultValuesBase):
+    full_name: str
+    type: ModelEndpointMonitoringMetricType
+    data: bool = False
+
+
 def _mapping_attributes(
     base_model: BaseModel,
     flattened_dictionary: dict,
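The new fully qualified metric name has the form <project>.<app>.<type>.<name>. A sketch using the module-private helpers added above; the project name is illustrative, while the app and result names reuse the histogram-data-drift constants from this diff.

from mlrun.common.schemas.model_monitoring.model_endpoints import (
    _compose_full_name,
    _parse_metric_fqn_to_monitoring_metric,
)

fqn = _compose_full_name(project="my-proj", app="histogram-data-drift", name="general_drift")
# -> "my-proj.histogram-data-drift.result.general_drift"

metric = _parse_metric_fqn_to_monitoring_metric(fqn)
assert metric.app == "histogram-data-drift"
assert metric.type == "result"  # ModelEndpointMonitoringMetricType.RESULT
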
mlrun/common/schemas/project.py
CHANGED
@@ -114,6 +114,8 @@ class ProjectSummary(pydantic.BaseModel):
     runs_failed_recent_count: int
     runs_running_count: int
     schedules_count: int
+    pipelines_completed_recent_count: typing.Optional[int] = None
+    pipelines_failed_recent_count: typing.Optional[int] = None
     pipelines_running_count: typing.Optional[int] = None
 
 
mlrun/config.py
CHANGED
@@ -361,7 +361,7 @@ default_config = {
         # is set to ClusterIP
         # ---------------------------------------------------------------------
         # Note: adding a mode requires special handling on
-        # - mlrun.runtimes.constants.NuclioIngressAddTemplatedIngressModes
+        # - mlrun.common.runtimes.constants.NuclioIngressAddTemplatedIngressModes
         # - mlrun.runtimes.nuclio.function.enrich_function_with_ingress
         "add_templated_ingress_host_mode": "never",
         "explicit_ack": "enabled",
@@ -503,6 +503,7 @@ default_config = {
             "default": "v3io:///users/pipelines/{project}/model-endpoints/{kind}",
            "user_space": "v3io:///projects/{project}/model-endpoints/{kind}",
            "stream": "",
+           "monitoring_application": "v3io:///users/pipelines/{project}/monitoring-apps/",
        },
        # Offline storage path can be either relative or a full path. This path is used for general offline data
        # storage such as the parquet file which is generated from the monitoring stream function for the drift analysis
@@ -516,6 +517,7 @@ default_config = {
        # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
        "store_type": "v3io-nosql",
        "endpoint_store_connection": "",
+       "tsdb_connector_type": "v3io-tsdb",
    },
    "secret_stores": {
        # Use only in testing scenarios (such as integration tests) to avoid using k8s for secrets (will use in-memory
@@ -554,7 +556,7 @@ default_config = {
            "nosql": "v3io:///projects/{project}/FeatureStore/{name}/nosql",
            # "authority" is optional and generalizes [userinfo "@"] host [":" port]
            "redisnosql": "redis://{authority}/projects/{project}/FeatureStore/{name}/nosql",
-           "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/…
+           "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/{kind}",
        },
        "default_targets": "parquet,nosql",
        "default_job_image": "mlrun/mlrun",
@@ -692,7 +694,7 @@ default_config = {
    "grafana_url": "",
    "alerts": {
        # supported modes: "enabled", "disabled".
-       "mode": "…
+       "mode": "enabled"
    },
    "auth_with_client_id": {
        "enabled": False,
@@ -1088,6 +1090,7 @@ class Config:
        target: str = "online",
        artifact_path: str = None,
        function_name: str = None,
+       **kwargs,
    ) -> typing.Union[str, list[str]]:
        """Get the full path from the configuration based on the provided project and kind.
 
@@ -1114,7 +1117,7 @@
        )
        if store_prefix_dict.get(kind):
            # Target exist in store prefix and has a valid string value
-           return store_prefix_dict[kind].format(project=project)
+           return store_prefix_dict[kind].format(project=project, **kwargs)
 
        if (
            function_name
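The **kwargs pass-through means a store-prefix template can now fill placeholders beyond {project}; a minimal sketch of the resulting formatting, using the "default" prefix from the hunk above (the enclosing config method itself is not shown in this diff, and the project/kind values are illustrative).

store_prefix = "v3io:///users/pipelines/{project}/model-endpoints/{kind}"
path = store_prefix.format(project="my-proj", kind="events")
# -> "v3io:///users/pipelines/my-proj/model-endpoints/events"
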
mlrun/data_types/to_pandas.py
CHANGED
@@ -65,10 +65,10 @@ def toPandas(spark_df):
            msg = (
                "toPandas attempted Arrow optimization because "
                "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, "
-               "failed by the reason below:\n…
+               f"failed by the reason below:\n {e}\n"
                "Attempting non-optimization as "
                "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to "
-               "true."
+               "true."
            )
            warnings.warn(msg)
            use_arrow = False
@@ -78,7 +78,7 @@ def toPandas(spark_df):
                "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has "
                "reached the error below and will not continue because automatic fallback "
                "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to "
-               "false.\n…
+               f"false.\n {e}"
            )
            warnings.warn(msg)
            raise
@@ -144,7 +144,7 @@ def toPandas(spark_df):
                "reached the error below and can not continue. Note that "
                "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an "
                "effect on failures in the middle of "
-               "computation.\n…
+               f"computation.\n {e}"
            )
            warnings.warn(msg)
            raise
mlrun/datastore/base.py
CHANGED
@@ -179,11 +179,23 @@ class DataStore:
         return {}
 
     @staticmethod
-    def _parquet_reader(…
+    def _parquet_reader(
+        df_module,
+        url,
+        file_system,
+        time_column,
+        start_time,
+        end_time,
+        additional_filters,
+    ):
         from storey.utils import find_filters, find_partitions
 
         def set_filters(
-            partitions_time_attributes,…
+            partitions_time_attributes,
+            start_time_inner,
+            end_time_inner,
+            filters_inner,
+            kwargs,
         ):
             filters = []
             find_filters(
@@ -193,20 +205,23 @@ class DataStore:
                 filters,
                 time_column,
             )
+            if filters and filters_inner:
+                filters[0] += filters_inner
+
             kwargs["filters"] = filters
 
         def reader(*args, **kwargs):
-            if start_time or end_time:
-…
-…
-…
-…
-…
+            if time_column is None and (start_time or end_time):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "When providing start_time or end_time, must provide time_column"
+                )
+            if start_time or end_time or additional_filters:
                 partitions_time_attributes = find_partitions(url, file_system)
                 set_filters(
                     partitions_time_attributes,
                     start_time,
                     end_time,
+                    additional_filters,
                     kwargs,
                 )
                 try:
@@ -217,6 +232,7 @@
                 ):
                     raise ex
 
+                # TODO: fix timezone issue (ML-6308)
                 if start_time.tzinfo:
                     start_time_inner = start_time.replace(tzinfo=None)
                     end_time_inner = end_time.replace(tzinfo=None)
@@ -228,6 +244,7 @@
                     partitions_time_attributes,
                     start_time_inner,
                     end_time_inner,
+                    additional_filters,
                     kwargs,
                 )
             return df_module.read_parquet(*args, **kwargs)
@@ -246,6 +263,7 @@
         start_time=None,
         end_time=None,
         time_column=None,
+        additional_filters=None,
         **kwargs,
     ):
         df_module = df_module or pd
@@ -310,7 +328,13 @@
             kwargs["columns"] = columns
 
             reader = self._parquet_reader(
-                df_module,…
+                df_module,
+                url,
+                file_system,
+                time_column,
+                start_time,
+                end_time,
+                additional_filters,
             )
 
         elif file_url.endswith(".json") or format == "json":
@@ -539,6 +563,7 @@
         time_column=None,
         start_time=None,
         end_time=None,
+        additional_filters=None,
         **kwargs,
     ):
         """return a dataframe object (generated from the dataitem).
@@ -550,6 +575,12 @@
         :param end_time:    filters out data after this time
         :param time_column: Store timestamp_key will be used if None.
                             The results will be filtered by this column and start_time & end_time.
+        :param additional_filters: List of additional_filter conditions as tuples.
+                                   Each tuple should be in the format (column_name, operator, value).
+                                   Supported operators: "=", ">=", "<=", ">", "<".
+                                   Example: [("Product", "=", "Computer")]
+                                   For all supported filters, please see:
+                                   https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html
         """
         df = self._store.as_df(
             self._url,
@@ -560,6 +591,7 @@
             time_column=time_column,
             start_time=start_time,
             end_time=end_time,
+            additional_filters=additional_filters,
             **kwargs,
         )
         return df