mlrun 1.7.0rc6__py3-none-any.whl → 1.7.0rc8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
- mlrun/__main__.py +2 -0
- mlrun/common/constants.py +6 -0
- mlrun/common/schemas/__init__.py +3 -0
- mlrun/common/schemas/api_gateway.py +8 -1
- mlrun/common/schemas/model_monitoring/__init__.py +4 -0
- mlrun/common/schemas/model_monitoring/constants.py +35 -18
- mlrun/common/schemas/project.py +1 -0
- mlrun/common/types.py +7 -1
- mlrun/config.py +34 -10
- mlrun/data_types/data_types.py +4 -0
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +22 -16
- mlrun/datastore/datastore.py +4 -0
- mlrun/datastore/datastore_profile.py +7 -0
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/sources.py +2 -3
- mlrun/datastore/targets.py +6 -1
- mlrun/db/base.py +14 -6
- mlrun/db/httpdb.py +61 -56
- mlrun/db/nopdb.py +3 -0
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +6 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +20 -8
- mlrun/kfpops.py +2 -5
- mlrun/model.py +1 -0
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +104 -295
- mlrun/model_monitoring/controller.py +25 -25
- mlrun/model_monitoring/db/__init__.py +16 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -34
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +47 -6
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +49 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +76 -3
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +68 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/sqlite.py +13 -1
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +662 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +134 -3
- mlrun/model_monitoring/helpers.py +3 -3
- mlrun/model_monitoring/stream_processing.py +41 -9
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +4 -36
- mlrun/projects/pipelines.py +14 -2
- mlrun/projects/project.py +118 -103
- mlrun/run.py +5 -1
- mlrun/runtimes/base.py +6 -0
- mlrun/runtimes/nuclio/api_gateway.py +218 -65
- mlrun/runtimes/nuclio/function.py +3 -0
- mlrun/runtimes/nuclio/serving.py +28 -32
- mlrun/runtimes/pod.py +26 -0
- mlrun/serving/routers.py +4 -3
- mlrun/serving/server.py +4 -6
- mlrun/serving/states.py +34 -14
- mlrun/serving/v2_serving.py +4 -3
- mlrun/utils/helpers.py +34 -0
- mlrun/utils/http.py +1 -1
- mlrun/utils/retryer.py +1 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/METADATA +25 -16
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/RECORD +66 -62
- mlrun/model_monitoring/batch.py +0 -933
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/mysql.py +0 -34
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc6.dist-info → mlrun-1.7.0rc8.dist-info}/top_level.txt +0 -0
mlrun/__main__.py
CHANGED
mlrun/common/constants.py
CHANGED
@@ -14,4 +14,10 @@
 #
 IMAGE_NAME_ENRICH_REGISTRY_PREFIX = "."  # prefix for image name to enrich with registry
 MLRUN_CREATED_LABEL = "mlrun-created"
+MLRUN_MODEL_CONF = "model-conf"
+MLRUN_SERVING_SPEC_MOUNT_PATH = f"/tmp/mlrun/{MLRUN_MODEL_CONF}"
+MLRUN_SERVING_SPEC_FILENAME = "serving_spec.json"
+MLRUN_SERVING_SPEC_PATH = (
+    f"{MLRUN_SERVING_SPEC_MOUNT_PATH}/{MLRUN_SERVING_SPEC_FILENAME}"
+)
 MYSQL_MEDIUMBLOB_SIZE_BYTES = 16 * 1024 * 1024
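For reference, a worked expansion of the new serving-spec constants (names and values taken from the diff above; the final assertion is derived, not from the source):

    MLRUN_MODEL_CONF = "model-conf"
    MLRUN_SERVING_SPEC_MOUNT_PATH = f"/tmp/mlrun/{MLRUN_MODEL_CONF}"
    MLRUN_SERVING_SPEC_FILENAME = "serving_spec.json"
    MLRUN_SERVING_SPEC_PATH = (
        f"{MLRUN_SERVING_SPEC_MOUNT_PATH}/{MLRUN_SERVING_SPEC_FILENAME}"
    )
    assert MLRUN_SERVING_SPEC_PATH == "/tmp/mlrun/model-conf/serving_spec.json"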
mlrun/common/schemas/__init__.py
CHANGED
@@ -21,6 +21,7 @@ from .api_gateway import (
     APIGatewayMetadata,
     APIGatewaysOutput,
     APIGatewaySpec,
+    APIGatewayState,
     APIGatewayStatus,
     APIGatewayUpstream,
 )
@@ -124,6 +125,7 @@ from .model_monitoring import (
     EventFieldType,
     EventKeyMetrics,
     Features,
+    FeatureSetFeatures,
     FeatureValues,
     GrafanaColumn,
     GrafanaDataPoint,
@@ -139,6 +141,7 @@ from .model_monitoring import (
     ModelMonitoringMode,
     ModelMonitoringStoreKinds,
     MonitoringFunctionNames,
+    PrometheusEndpoints,
     TimeSeriesTarget,
 )
 from .notification import (
mlrun/common/schemas/api_gateway.py
CHANGED

@@ -36,6 +36,13 @@ class APIGatewayAuthenticationMode(mlrun.common.types.StrEnum):
     )
 
 
+class APIGatewayState(mlrun.common.types.StrEnum):
+    none = ""
+    ready = "ready"
+    error = "error"
+    waiting_for_provisioning = "waitingForProvisioning"
+
+
 class _APIGatewayBaseModel(pydantic.BaseModel):
     class Config:
         extra = pydantic.Extra.allow
@@ -72,7 +79,7 @@ class APIGatewaySpec(_APIGatewayBaseModel):
 
 class APIGatewayStatus(_APIGatewayBaseModel):
     name: Optional[str]
-    state: Optional[str]
+    state: Optional[APIGatewayState]
 
 
 class APIGateway(_APIGatewayBaseModel):
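With the status field now typed, state checks can use enum members instead of raw strings; a minimal sketch (the helper function is hypothetical, `APIGatewayState` is from the diff above):

    from mlrun.common.schemas import APIGatewayState

    def is_gateway_ready(state: APIGatewayState) -> bool:
        # StrEnum members also compare equal to their raw string values
        return state == APIGatewayState.ready

    assert is_gateway_ready(APIGatewayState("ready"))
    assert not is_gateway_ready(APIGatewayState.waiting_for_provisioning)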
mlrun/common/schemas/model_monitoring/__init__.py
CHANGED

@@ -22,6 +22,7 @@ from .constants import (
     EventFieldType,
     EventKeyMetrics,
     EventLiveStats,
+    FeatureSetFeatures,
     FileTargetKind,
     FunctionURI,
     ModelEndpointTarget,
@@ -29,9 +30,12 @@ from .constants import (
     ModelMonitoringStoreKinds,
     MonitoringFunctionNames,
     ProjectSecretKeys,
+    PrometheusEndpoints,
     PrometheusMetric,
+    SchedulingKeys,
     TimeSeriesTarget,
     VersionedModel,
+    WriterEvent,
 )
 from .grafana import (
     GrafanaColumn,
mlrun/common/schemas/model_monitoring/constants.py
CHANGED

@@ -21,6 +21,12 @@ import mlrun.common.helpers
 from mlrun.common.types import StrEnum
 
 
+class MonitoringStrEnum(StrEnum):
+    @classmethod
+    def list(cls):
+        return list(map(lambda c: c.value, cls))
+
+
 class EventFieldType:
     FUNCTION_URI = "function_uri"
     FUNCTION = "function"
@@ -77,6 +83,20 @@ class EventFieldType:
     SAMPLE_PARQUET_PATH = "sample_parquet_path"
 
 
+class FeatureSetFeatures(MonitoringStrEnum):
+    LATENCY = EventFieldType.LATENCY
+    ERROR_COUNT = EventFieldType.ERROR_COUNT
+    METRICS = EventFieldType.METRICS
+
+    @classmethod
+    def time_stamp(cls):
+        return EventFieldType.TIMESTAMP
+
+    @classmethod
+    def entity(cls):
+        return EventFieldType.ENDPOINT_ID
+
+
 class ApplicationEvent:
     APPLICATION_NAME = "application_name"
     CURRENT_STATS = "current_stats"
@@ -89,7 +109,7 @@ class ApplicationEvent:
     OUTPUT_STREAM_URI = "output_stream_uri"
 
 
-class WriterEvent(StrEnum):
+class WriterEvent(MonitoringStrEnum):
     APPLICATION_NAME = "application_name"
     ENDPOINT_ID = "endpoint_id"
     START_INFER_TIME = "start_infer_time"
@@ -101,10 +121,6 @@ class WriterEvent(StrEnum):
     RESULT_EXTRA_DATA = "result_extra_data"
     CURRENT_STATS = "current_stats"
 
-    @classmethod
-    def list(cls):
-        return list(map(lambda c: c.value, cls))
-
 
 class EventLiveStats:
     LATENCY_AVG_5M = "latency_avg_5m"
@@ -146,6 +162,9 @@ class ModelMonitoringStoreKinds:
 
 class SchedulingKeys:
     LAST_ANALYZED = "last_analyzed"
+    ENDPOINT_ID = "endpoint_id"
+    APPLICATION_NAME = "application_name"
+    UID = "uid"
 
 
 class FileTargetKind:
@@ -155,6 +174,8 @@ class FileTargetKind:
     PARQUET = "parquet"
     APPS_PARQUET = "apps_parquet"
     LOG_STREAM = "log_stream"
+    APP_RESULTS = "app_results"
+    MONITORING_SCHEDULES = "monitoring_schedules"
 
 
 class ModelMonitoringMode(str, Enum):
@@ -177,20 +198,16 @@ class PrometheusMetric:
     DRIFT_STATUS = "drift_status"
 
 
-class MonitoringFunctionNames:
-    STREAM = "model-monitoring-stream"
-    BATCH = "model-monitoring-batch"
-    APPLICATION_CONTROLLER = "model-monitoring-controller"
-    WRITER = "model-monitoring-writer"
+class PrometheusEndpoints(MonitoringStrEnum):
+    MODEL_MONITORING_METRICS = "/model-monitoring-metrics"
+    MONITORING_BATCH_METRICS = "/monitoring-batch-metrics"
+    MONITORING_DRIFT_STATUS = "/monitoring-drift-status"
+
 
-    @classmethod
-    def list(cls):
-        return [
-            MonitoringFunctionNames.STREAM,
-            MonitoringFunctionNames.BATCH,
-            MonitoringFunctionNames.APPLICATION_CONTROLLER,
-        ]
+class MonitoringFunctionNames(MonitoringStrEnum):
+    STREAM = "model-monitoring-stream"
+    APPLICATION_CONTROLLER = "model-monitoring-controller"
+    WRITER = "model-monitoring-writer"
 
 
 @dataclass
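The shared `list()` helper that `MonitoringStrEnum` introduces (previously defined ad hoc on `WriterEvent`) returns the members' plain string values, in declaration order; a minimal sketch using values visible in the diff above:

    from mlrun.common.schemas.model_monitoring import MonitoringFunctionNames

    # every MonitoringStrEnum subclass exposes its values as plain strings
    assert MonitoringFunctionNames.list() == [
        "model-monitoring-stream",
        "model-monitoring-controller",
        "model-monitoring-writer",
    ]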
mlrun/common/schemas/project.py
CHANGED
@@ -87,6 +87,7 @@ class ProjectSpec(pydantic.BaseModel):
     custom_packagers: typing.Optional[list[tuple[str, bool]]] = None
     default_image: typing.Optional[str] = None
     build: typing.Optional[ImageBuilder] = None
+    default_function_node_selector: typing.Optional[dict] = {}
 
     class Config:
         extra = pydantic.Extra.allow
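A hedged sketch of the new project-level default (the field name is from the diff; constructing `ProjectSpec` directly like this is illustrative only):

    from mlrun.common.schemas.project import ProjectSpec

    # functions in the project inherit these node-selection labels by default
    spec = ProjectSpec(default_function_node_selector={"disktype": "ssd"})
    assert spec.default_function_node_selector == {"disktype": "ssd"}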
mlrun/common/types.py
CHANGED
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
 import enum
 
@@ -23,3 +22,10 @@ class StrEnum(str, enum.Enum):
 
     def __repr__(self):
         return self.value
+
+
+# Partial backport from Python 3.11
+# https://docs.python.org/3/library/http.html#http.HTTPMethod
+class HTTPMethod(StrEnum):
+    GET = "GET"
+    POST = "POST"
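Because `HTTPMethod` derives from the string-based `StrEnum`, its members can stand in for plain method strings; a minimal sketch:

    from mlrun.common.types import HTTPMethod

    # members compare equal to ordinary strings
    assert HTTPMethod.GET == "GET"
    assert HTTPMethod.POST.value == "POST"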
mlrun/config.py
CHANGED
@@ -362,6 +362,8 @@ default_config = {
         # - mlrun.runtimes.nuclio.function.enrich_function_with_ingress
         "add_templated_ingress_host_mode": "never",
         "explicit_ack": "enabled",
+        # size of serving spec to move to config maps
+        "serving_spec_env_cutoff": 4096,
     },
     "logs": {
         "decode": {
@@ -479,6 +481,14 @@ default_config = {
         # if set to true, will log a warning for trying to use run db functionality while in nop db mode
         "verbose": True,
     },
+    "pagination": {
+        "default_page_size": 20,
+        "pagination_cache": {
+            "interval": 60,
+            "ttl": 3600,
+            "max_size": 10000,
+        },
+    },
 },
 "model_endpoint_monitoring": {
     "serving_stream_args": {"shard_count": 1, "retention_period_hours": 24},
@@ -498,10 +508,9 @@ default_config = {
     # when the user is working in CE environment and has not provided any stream path.
     "default_http_sink": "http://nuclio-{project}-model-monitoring-stream.{namespace}.svc.cluster.local:8080",
     "default_http_sink_app": "http://nuclio-{project}-{application_name}.{namespace}.svc.cluster.local:8080",
-    "batch_processing_function_branch": "master",
     "parquet_batching_max_events": 10_000,
     "parquet_batching_timeout_secs": timedelta(minutes=1).total_seconds(),
-    # See mlrun.model_monitoring.stores.ObjectStoreFactory for available options
+    # See mlrun.model_monitoring.db.stores.ObjectStoreFactory for available options
     "store_type": "v3io-nosql",
     "endpoint_store_connection": "",
 },
@@ -542,6 +551,7 @@ default_config = {
     "nosql": "v3io:///projects/{project}/FeatureStore/{name}/{kind}",
     # "authority" is optional and generalizes [userinfo "@"] host [":" port]
     "redisnosql": "redis://{authority}/projects/{project}/FeatureStore/{name}/{kind}",
+    "dsnosql": "ds://{ds_profile_name}/projects/{project}/FeatureStore/{name}/{kind}",
 },
 "default_targets": "parquet,nosql",
 "default_job_image": "mlrun/mlrun",
@@ -616,8 +626,9 @@ default_config = {
 },
 "workflows": {
     "default_workflow_runner_name": "workflow-runner-{}",
-    # Default timeout seconds for retrieving workflow id after execution
-    "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
+    # Default timeout seconds for retrieving workflow id after execution
+    # Remote workflow timeout is the maximum between remote and the inner engine timeout
+    "timeouts": {"local": 120, "kfp": 60, "remote": 60 * 5},
 },
 "log_collector": {
     "address": "localhost:8282",
@@ -1066,7 +1077,7 @@ class Config:
         target: str = "online",
         artifact_path: str = None,
         function_name: str = None,
-    ) -> str:
+    ) -> typing.Union[str, list[str]]:
         """Get the full path from the configuration based on the provided project and kind.
 
         :param project: Project name.
@@ -1082,7 +1093,8 @@ class Config:
                               relative artifact path will be taken from the global MLRun artifact path.
         :param function_name: Application name, None for model_monitoring_stream.
 
-        :return: Full configured path for the provided kind.
+        :return: Full configured path for the provided kind. Can be either a single path
+                 or a list of paths in the case of the online model monitoring stream path.
         """
 
         if target != "offline":
@@ -1104,10 +1116,22 @@ class Config:
                 if function_name is None
                 else f"{kind}-{function_name.lower()}",
             )
-            return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
-                project=project,
-                kind=kind,
-            )
+            elif kind == "stream":  # return list for mlrun<1.6.3 BC
+                return [
+                    mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+                        project=project,
+                        kind=kind,
+                    ),  # old stream uri (pipelines) for BC ML-6043
+                    mlrun.mlconf.model_endpoint_monitoring.store_prefixes.user_space.format(
+                        project=project,
+                        kind=kind,
+                    ),  # new stream uri (projects)
+                ]
+            else:
+                return mlrun.mlconf.model_endpoint_monitoring.store_prefixes.default.format(
+                    project=project,
+                    kind=kind,
+                )
 
         # Get the current offline path from the configuration
         file_path = mlrun.mlconf.model_endpoint_monitoring.offline_storage_path.format(
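A hedged sketch of the new return behavior (the method shown in the hunk is `Config.get_model_monitoring_file_target_path` in mlrun; treat the exact call form as an assumption):

    import mlrun

    # for kind="stream" the helper now returns both the legacy (pipelines)
    # and the new (projects) stream URIs; other kinds still return one string
    paths = mlrun.mlconf.get_model_monitoring_file_target_path(
        project="my-project", kind="stream"
    )
    assert isinstance(paths, list) and len(paths) == 2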
mlrun/data_types/data_types.py
CHANGED
@@ -41,6 +41,7 @@ class ValueType(str, Enum):
     BYTES = "bytes"
     STRING = "str"
     DATETIME = "datetime"
+    LIST = "List"
     BYTES_LIST = "List[bytes]"
     STRING_LIST = "List[string]"
     INT32_LIST = "List[int32]"
@@ -48,6 +49,7 @@ class ValueType(str, Enum):
     DOUBLE_LIST = "List[float]"
     FLOAT_LIST = "List[float32]"
     BOOL_LIST = "List[bool]"
+    Tuple = "Tuple"
 
 
 def pd_schema_to_value_type(value):
@@ -102,6 +104,8 @@ def python_type_to_value_type(value_type):
         "datetime64[ns]": ValueType.INT64,
         "datetime64[ns, tz]": ValueType.INT64,
         "category": ValueType.STRING,
+        "list": ValueType.LIST,
+        "tuple": ValueType.Tuple,
     }
 
     if type_name in type_map:
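For reference, the string values behind the two new members (taken directly from the diff; note the deliberately preserved, inconsistent casing of `Tuple`):

    from mlrun.data_types.data_types import ValueType

    # ValueType subclasses str, so members compare equal to their values
    assert ValueType.LIST == "List"
    assert ValueType.Tuple == "Tuple"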
mlrun/datastore/alibaba_oss.py
ADDED

@@ -0,0 +1,130 @@
+# Copyright 2023 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import urlparse
+
+import oss2
+from fsspec.registry import get_filesystem_class
+
+import mlrun.errors
+
+from .base import DataStore, FileStats, makeDatastoreSchemaSanitizer
+
+
+class OSSStore(DataStore):
+    using_bucket = True
+
+    def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
+        super().__init__(parent, name, schema, endpoint, secrets)
+        # will be used in case user asks to assume a role and work through fsspec
+
+        access_key_id = self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID")
+        secret_key = self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY")
+        endpoint_url = self._get_secret_or_env("ALIBABA_ENDPOINT_URL")
+        if access_key_id and secret_key and endpoint_url:
+            self.auth = oss2.Auth(access_key_id, secret_key)
+            self.endpoint_url = endpoint_url
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "missing ALIBABA_ACCESS_KEY_ID or ALIBABA_SECRET_ACCESS_KEY ALIBABA_ENDPOINT_URL in environment"
+            )
+
+    @property
+    def filesystem(self):
+        """return fsspec file system object, if supported"""
+        if self._filesystem:
+            return self._filesystem
+        try:
+            import ossfs  # noqa
+        except ImportError as exc:
+            raise ImportError("ALIBABA ossfs not installed") from exc
+        filesystem_class = get_filesystem_class(protocol=self.kind)
+        self._filesystem = makeDatastoreSchemaSanitizer(
+            filesystem_class,
+            using_bucket=self.using_bucket,
+            **self.get_storage_options(),
+        )
+        return self._filesystem
+
+    def get_storage_options(self):
+        res = dict(
+            endpoint=self._get_secret_or_env("ALIBABA_ENDPOINT_URL"),
+            key=self._get_secret_or_env("ALIBABA_ACCESS_KEY_ID"),
+            secret=self._get_secret_or_env("ALIBABA_SECRET_ACCESS_KEY"),
+        )
+        return self._sanitize_storage_options(res)
+
+    def get_bucket_and_key(self, key):
+        path = self._join(key)[1:]
+        return self.endpoint, path
+
+    def upload(self, key, src_path):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, open(src_path, "rb"))
+
+    def get(self, key, size=None, offset=0):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        if size or offset:
+            return oss.get_object(key, byte_range=self.get_range(size, offset)).read()
+        return oss.get_object(key).read()
+
+    def put(self, key, data, append=False):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.put_object(key, data)
+
+    def stat(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        obj = oss.get_object_meta(key)
+        size = obj.content_length
+        modified = datetime.fromtimestamp(obj.last_modified)
+        return FileStats(size, time.mktime(modified.timetuple()))
+
+    def listdir(self, key):
+        remote_path = self._convert_key_to_remote_path(key)
+        if self.filesystem.isfile(remote_path):
+            return key
+        remote_path = f"{remote_path}/**"
+        files = self.filesystem.glob(remote_path)
+        key_length = len(key)
+        files = [
+            f.split("/", 1)[1][key_length:] for f in files if len(f.split("/")) > 1
+        ]
+        return files
+
+    def delete(self, key):
+        bucket, key = self.get_bucket_and_key(key)
+        oss = oss2.Bucket(self.auth, self.endpoint_url, bucket)
+        oss.delete_object(key)
+
+    def _convert_key_to_remote_path(self, key):
+        key = key.strip("/")
+        schema = urlparse(key).scheme
+        # if called without passing dataitem - like in fset.purge_targets,
+        # key will include schema.
+        if not schema:
+            key = Path(self.endpoint, key).as_posix()
+        return key
+
+    @staticmethod
+    def get_range(size, offset):
+        if size:
+            return [offset, size]
+        return [offset, None]
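A hedged usage sketch for the new Alibaba OSS datastore: the three environment variables come straight from the file above, while the `oss://` URL scheme, bucket, and object path are illustrative assumptions (the scheme registration lives in mlrun/datastore/datastore.py, also changed in this release).

    import os
    import mlrun

    # credentials and endpoint are resolved via _get_secret_or_env
    os.environ["ALIBABA_ACCESS_KEY_ID"] = "<access-key-id>"
    os.environ["ALIBABA_SECRET_ACCESS_KEY"] = "<secret-access-key>"
    os.environ["ALIBABA_ENDPOINT_URL"] = "<oss-endpoint-url>"

    # read an object through the generic datastore layer
    item = mlrun.get_dataitem("oss://my-bucket/path/to/file.csv")
    df = item.as_df()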
mlrun/datastore/azure_blob.py
CHANGED
@@ -158,18 +158,17 @@ class AzureBlobStore(DataStore):
             st[key] = parsed_value
 
         account_name = st.get("account_name")
-        if not account_name:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Property 'account_name' is absent both in storage settings and connection string"
-            )
         if primary_url:
             if primary_url.startswith("http://"):
                 primary_url = primary_url[len("http://") :]
             if primary_url.startswith("https://"):
                 primary_url = primary_url[len("https://") :]
             host = primary_url
-        else:
+        elif account_name:
             host = f"{account_name}.{service}.core.windows.net"
+        else:
+            return res
 
         if "account_key" in st:
             res[f"spark.hadoop.fs.azure.account.key.{host}"] = st["account_key"]
mlrun/datastore/base.py
CHANGED
@@ -27,6 +27,7 @@ import requests
 import urllib3
 from deprecated import deprecated
 
+import mlrun.config
 import mlrun.errors
 from mlrun.errors import err_to_str
 from mlrun.utils import StorePrefix, is_ipython, logger
@@ -34,10 +35,6 @@ from mlrun.utils import StorePrefix, is_ipython, logger
 from .store_resources import is_store_uri, parse_store_uri
 from .utils import filter_df_start_end_time, select_columns_from_df
 
-verify_ssl = False
-if not verify_ssl:
-    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
 
 class FileStats:
     def __init__(self, size, modified, content_type=None):
@@ -633,17 +630,6 @@ def basic_auth_header(user, password):
     return {"Authorization": authstr}
 
 
-def http_get(url, headers=None, auth=None):
-    try:
-        response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
-    except OSError as exc:
-        raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
-
-    mlrun.errors.raise_for_status(response)
-
-    return response.content
-
-
 class HttpStore(DataStore):
     def __init__(self, parent, schema, name, endpoint="", secrets: dict = None):
         super().__init__(parent, name, schema, endpoint, secrets)
@@ -671,7 +657,7 @@ class HttpStore(DataStore):
         raise ValueError("unimplemented")
 
     def get(self, key, size=None, offset=0):
-        data = http_get(self.url + self._join(key), self._headers, self.auth)
+        data = self._http_get(self.url + self._join(key), self._headers, self.auth)
         if offset:
             data = data[offset:]
         if size:
@@ -691,6 +677,26 @@ class HttpStore(DataStore):
             f"schema as it is not secure and is not recommended."
         )
 
+    def _http_get(
+        self,
+        url,
+        headers=None,
+        auth=None,
+    ):
+        # import here to prevent import cycle
+        from mlrun.config import config as mlconf
+
+        verify_ssl = mlconf.httpdb.http.verify
+        try:
+            if not verify_ssl:
+                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+            response = requests.get(url, headers=headers, auth=auth, verify=verify_ssl)
+        except OSError as exc:
+            raise OSError(f"error: cannot connect to {url}: {err_to_str(exc)}")
+
+        mlrun.errors.raise_for_status(response)
+        return response.content
+
 
 # This wrapper class is designed to extract the 'ds' schema and profile name from URL-formatted paths.
 # Within fsspec, the AbstractFileSystem::_strip_protocol() internal method is used to handle complete URL paths.
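SSL verification for HttpStore reads is now driven by configuration rather than a hardcoded module-level flag; a minimal sketch (the config key is taken verbatim from the `_http_get` body above):

    from mlrun.config import config as mlconf

    # disable certificate verification for http(s) datastore reads;
    # insecure-request warnings are suppressed only in that case
    mlconf.httpdb.http.verify = False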
mlrun/datastore/datastore.py
CHANGED
mlrun/datastore/datastore_profile.py
CHANGED

@@ -68,6 +68,9 @@ class TemporaryClientDatastoreProfiles(metaclass=mlrun.utils.singleton.Singleton
     def get(self, key):
         return self._data.get(key, None)
 
+    def remove(self, key):
+        self._data.pop(key, None)
+
 
 class DatastoreProfileBasic(DatastoreProfile):
     type: str = pydantic.Field("basic")
@@ -460,3 +463,7 @@ def register_temporary_client_datastore_profile(profile: DatastoreProfile):
     It's beneficial for testing purposes.
     """
     TemporaryClientDatastoreProfiles().add(profile)
+
+
+def remove_temporary_client_datastore_profile(profile_name: str):
+    TemporaryClientDatastoreProfiles().remove(profile_name)
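The new `remove` method and module-level helper complete the client-side lifecycle of temporary datastore profiles; a hedged sketch (the `DatastoreProfileBasic` arguments beyond `name` are assumptions for illustration):

    from mlrun.datastore.datastore_profile import (
        DatastoreProfileBasic,
        register_temporary_client_datastore_profile,
        remove_temporary_client_datastore_profile,
    )

    profile = DatastoreProfileBasic(name="tmp-profile", public="some-public-value")
    register_temporary_client_datastore_profile(profile)
    # ... use ds://tmp-profile/... paths ...
    remove_temporary_client_datastore_profile("tmp-profile")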
mlrun/datastore/google_cloud_storage.py
CHANGED

@@ -132,7 +132,7 @@ class GoogleCloudStorageStore(DataStore):
         self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
 
     def get_spark_options(self):
-        res = None
+        res = {}
         st = self.get_storage_options()
         if "token" in st:
             res = {"spark.hadoop.google.cloud.auth.service.account.enable": "true"}
mlrun/datastore/sources.py
CHANGED
@@ -854,12 +854,11 @@ class StreamSource(OnlineSource):
         super().__init__(name, attributes=attrs, **kwargs)
 
     def add_nuclio_trigger(self, function):
-        store, path, url = mlrun.store_manager.get_or_create_store(self.path)
+        store, _, url = mlrun.store_manager.get_or_create_store(self.path)
         if store.kind != "v3io":
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Only profiles that reference the v3io datastore can be used with StreamSource"
             )
-        path = "v3io:/" + path
         storage_options = store.get_storage_options()
         access_key = storage_options.get("v3io_access_key")
         endpoint, stream_path = parse_path(url)
@@ -883,7 +882,7 @@ class StreamSource(OnlineSource):
             kwargs["worker_allocation_mode"] = "static"
 
         function.add_v3io_stream_trigger(
-            path,
+            url,
             self.name,
             self.attributes["group"],
             self.attributes["seek_to"],
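For context, `StreamSource.add_nuclio_trigger` now takes the trigger URL from the store manager instead of rebuilding a `v3io:/` path by hand; a hedged usage sketch (constructor arguments are illustrative):

    from mlrun.datastore.sources import StreamSource

    # must reference a v3io stream, otherwise add_nuclio_trigger raises
    # MLRunInvalidArgumentError
    source = StreamSource(name="events", path="v3io:///projects/demo/streams/events")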
mlrun/datastore/targets.py
CHANGED
@@ -524,7 +524,12 @@ class BaseStoreTarget(DataTargetBase):
         store, path_in_store, target_path = self._get_store_and_path()
         target_path = generate_path_with_chunk(self, chunk_id, target_path)
         file_system = store.filesystem
-        if file_system.protocol == "file":
+        if (
+            file_system.protocol == "file"
+            # fsspec 2023.10.0 changed protocol from "file" to ("file", "local")
+            or isinstance(file_system.protocol, (tuple, list))
+            and "file" in file_system.protocol
+        ):
             dir = os.path.dirname(target_path)
             if dir:
                 os.makedirs(dir, exist_ok=True)