mlrun 1.10.0rc40__py3-none-any.whl → 1.11.0rc16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun has been flagged as possibly problematic.
- mlrun/__init__.py +3 -2
- mlrun/__main__.py +0 -4
- mlrun/artifacts/dataset.py +2 -2
- mlrun/artifacts/plots.py +1 -1
- mlrun/{model_monitoring/db/tsdb/tdengine → auth}/__init__.py +2 -3
- mlrun/auth/nuclio.py +89 -0
- mlrun/auth/providers.py +429 -0
- mlrun/auth/utils.py +415 -0
- mlrun/common/constants.py +7 -0
- mlrun/common/model_monitoring/helpers.py +41 -4
- mlrun/common/runtimes/constants.py +28 -0
- mlrun/common/schemas/__init__.py +13 -3
- mlrun/common/schemas/alert.py +2 -2
- mlrun/common/schemas/api_gateway.py +3 -0
- mlrun/common/schemas/auth.py +10 -10
- mlrun/common/schemas/client_spec.py +4 -0
- mlrun/common/schemas/constants.py +25 -0
- mlrun/common/schemas/frontend_spec.py +1 -8
- mlrun/common/schemas/function.py +24 -0
- mlrun/common/schemas/hub.py +3 -2
- mlrun/common/schemas/model_monitoring/__init__.py +1 -1
- mlrun/common/schemas/model_monitoring/constants.py +2 -2
- mlrun/common/schemas/secret.py +17 -2
- mlrun/common/secrets.py +95 -1
- mlrun/common/types.py +10 -10
- mlrun/config.py +53 -15
- mlrun/data_types/infer.py +2 -2
- mlrun/datastore/__init__.py +2 -3
- mlrun/datastore/base.py +274 -10
- mlrun/datastore/datastore.py +1 -1
- mlrun/datastore/datastore_profile.py +49 -17
- mlrun/datastore/model_provider/huggingface_provider.py +6 -2
- mlrun/datastore/model_provider/model_provider.py +2 -2
- mlrun/datastore/model_provider/openai_provider.py +2 -2
- mlrun/datastore/s3.py +15 -16
- mlrun/datastore/sources.py +1 -1
- mlrun/datastore/store_resources.py +4 -4
- mlrun/datastore/storeytargets.py +16 -10
- mlrun/datastore/targets.py +1 -1
- mlrun/datastore/utils.py +16 -3
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +36 -12
- mlrun/db/httpdb.py +316 -101
- mlrun/db/nopdb.py +29 -11
- mlrun/errors.py +4 -2
- mlrun/execution.py +11 -12
- mlrun/feature_store/api.py +1 -1
- mlrun/feature_store/common.py +1 -1
- mlrun/feature_store/feature_vector_utils.py +1 -1
- mlrun/feature_store/steps.py +8 -6
- mlrun/frameworks/_common/utils.py +3 -3
- mlrun/frameworks/_dl_common/loggers/logger.py +1 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -1
- mlrun/frameworks/_ml_common/loggers/mlrun_logger.py +1 -1
- mlrun/frameworks/_ml_common/utils.py +2 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +4 -3
- mlrun/frameworks/lgbm/mlrun_interfaces/mlrun_interface.py +2 -1
- mlrun/frameworks/onnx/dataset.py +2 -1
- mlrun/frameworks/onnx/mlrun_interface.py +2 -1
- mlrun/frameworks/pytorch/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/pytorch/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/frameworks/pytorch/utils.py +2 -1
- mlrun/frameworks/sklearn/metric.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +5 -4
- mlrun/frameworks/tf_keras/callbacks/mlrun_logging_callback.py +2 -1
- mlrun/frameworks/tf_keras/callbacks/tensorboard_logging_callback.py +2 -1
- mlrun/hub/__init__.py +37 -0
- mlrun/hub/base.py +142 -0
- mlrun/hub/module.py +67 -76
- mlrun/hub/step.py +113 -0
- mlrun/launcher/base.py +2 -1
- mlrun/launcher/local.py +2 -1
- mlrun/model.py +12 -2
- mlrun/model_monitoring/__init__.py +0 -1
- mlrun/model_monitoring/api.py +2 -2
- mlrun/model_monitoring/applications/base.py +20 -6
- mlrun/model_monitoring/applications/context.py +1 -0
- mlrun/model_monitoring/controller.py +7 -17
- mlrun/model_monitoring/db/_schedules.py +2 -16
- mlrun/model_monitoring/db/_stats.py +2 -13
- mlrun/model_monitoring/db/tsdb/__init__.py +9 -7
- mlrun/model_monitoring/db/tsdb/base.py +2 -4
- mlrun/model_monitoring/db/tsdb/preaggregate.py +234 -0
- mlrun/model_monitoring/db/tsdb/stream_graph_steps.py +63 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_metrics_queries.py +414 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_predictions_queries.py +376 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/queries/timescaledb_results_queries.py +590 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connection.py +434 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_connector.py +541 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_operations.py +808 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_schema.py +502 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream.py +163 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/timescaledb_stream_graph_steps.py +60 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_dataframe_processor.py +141 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/utils/timescaledb_query_builder.py +585 -0
- mlrun/model_monitoring/db/tsdb/timescaledb/writer_graph_steps.py +73 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +4 -6
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +147 -79
- mlrun/model_monitoring/features_drift_table.py +2 -1
- mlrun/model_monitoring/helpers.py +2 -1
- mlrun/model_monitoring/stream_processing.py +18 -16
- mlrun/model_monitoring/writer.py +4 -3
- mlrun/package/__init__.py +2 -1
- mlrun/platforms/__init__.py +0 -44
- mlrun/platforms/iguazio.py +1 -1
- mlrun/projects/operations.py +11 -10
- mlrun/projects/project.py +81 -82
- mlrun/run.py +4 -7
- mlrun/runtimes/__init__.py +2 -204
- mlrun/runtimes/base.py +89 -21
- mlrun/runtimes/constants.py +225 -0
- mlrun/runtimes/daskjob.py +4 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +2 -1
- mlrun/runtimes/mounts.py +5 -0
- mlrun/runtimes/nuclio/__init__.py +12 -8
- mlrun/runtimes/nuclio/api_gateway.py +36 -6
- mlrun/runtimes/nuclio/application/application.py +200 -32
- mlrun/runtimes/nuclio/function.py +154 -49
- mlrun/runtimes/nuclio/serving.py +55 -42
- mlrun/runtimes/pod.py +59 -10
- mlrun/secrets.py +46 -2
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +5 -5
- mlrun/serving/routers.py +3 -3
- mlrun/serving/server.py +46 -43
- mlrun/serving/serving_wrapper.py +6 -2
- mlrun/serving/states.py +554 -207
- mlrun/serving/steps.py +1 -1
- mlrun/serving/system_steps.py +42 -33
- mlrun/track/trackers/mlflow_tracker.py +29 -31
- mlrun/utils/helpers.py +89 -16
- mlrun/utils/http.py +9 -2
- mlrun/utils/notifications/notification/git.py +1 -1
- mlrun/utils/notifications/notification/mail.py +39 -16
- mlrun/utils/notifications/notification_pusher.py +2 -2
- mlrun/utils/version/version.json +2 -2
- mlrun/utils/version/version.py +3 -4
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/METADATA +39 -49
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/RECORD +144 -130
- mlrun/db/auth_utils.py +0 -152
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +0 -343
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +0 -75
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connection.py +0 -281
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +0 -1368
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +0 -51
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc40.dist-info → mlrun-1.11.0rc16.dist-info}/top_level.txt +0 -0
mlrun/datastore/base.py
CHANGED
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import datetime
+import os
+import os.path
 import tempfile
 import urllib.parse
 from base64 import b64encode
 from copy import copy
-from os import path, remove
+from types import ModuleType
 from typing import Optional, Union
 from urllib.parse import urlparse
 
@@ -26,6 +29,7 @@ import pyarrow
 import pytz
 import requests
 
+import mlrun.common.schemas
 import mlrun.config
 import mlrun.errors
 from mlrun.datastore.remote_client import BaseRemoteClient
@@ -156,6 +160,195 @@ class DataStore(BaseRemoteClient):
     def get_spark_options(self, path=None):
         return {}
 
+    @staticmethod
+    def _is_directory_in_range(
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        year: int,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        **kwargs,
+    ):
+        """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+        from dateutil.relativedelta import relativedelta
+
+        partition_start = datetime.datetime(
+            year=year,
+            month=month or 1,
+            day=day or 1,
+            hour=hour or 0,
+            tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+        )
+        partition_end = (
+            partition_start
+            + relativedelta(
+                years=1 if month is None else 0,
+                months=1 if day is None and month is not None else 0,
+                days=1 if hour is None and day is not None else 0,
+                hours=1 if hour is not None else 0,
+            )
+            - datetime.timedelta(microseconds=1)
+        )
+
+        if (end_time and end_time < partition_start) or (
+            start_time and start_time > partition_end
+        ):
+            return False
+        return True
+
+    @staticmethod
+    def _list_partition_paths_helper(
+        paths: list[str],
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        current_path: str,
+        partition_level: str,
+        filesystem,
+    ):
+        directory_split = current_path.rsplit("/", 1)
+        time_unit = None
+        directory_start, directory_end = "", ""
+        if len(directory_split) == 2:
+            directory_start, directory_end = directory_split
+            time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+        if not time_unit and directory_end.endswith((".parquet", ".pq")):
+            paths.append(directory_start.rstrip("/"))
+            return
+        elif time_unit and time_unit == partition_level:
+            paths.append(current_path.rstrip("/"))
+            return
+
+        directories = filesystem.ls(current_path, detail=True)
+        if len(directories) == 0:
+            return
+        for directory in directories:
+            current_path = directory["name"]
+            parts = [p for p in current_path.split("/") if "=" in p]
+            kwargs = {}
+            for part in parts:
+                key, value = part.split("=", 1)
+                if value.isdigit():
+                    value = int(value)
+                kwargs[key] = value
+            if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+                DataStore._list_partition_paths_helper(
+                    paths,
+                    start_time,
+                    end_time,
+                    current_path,
+                    partition_level,
+                    filesystem,
+                )
+
+    @staticmethod
+    def _list_partitioned_paths(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_level: str,
+        filesystem,
+    ):
+        paths = []
+        parsed_base_url = urlparse(base_url)
+        base_path = parsed_base_url.path
+
+        if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+            base_path = parsed_base_url.netloc + base_path
+
+        DataStore._list_partition_paths_helper(
+            paths, start_time, end_time, base_path, partition_level, filesystem
+        )
+        paths = [
+            DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+            for path in paths
+        ]
+        return paths
+
+    @staticmethod
+    def _reconstruct_path_from_base_url(
+        parsed_base_url: urllib.parse.ParseResult, returned_path: str
+    ) -> str:
+        scheme = parsed_base_url.scheme
+        authority = parsed_base_url.netloc
+        returned_path = returned_path.lstrip("/")
+        if scheme == "v3io":
+            return f"{scheme}://{authority}/{returned_path}"
+        else:
+            return f"{scheme}://{returned_path}"
+
+    @staticmethod
+    def _clean_filters_for_partitions(
+        filters: list[list[tuple]],
+        partition_keys: list[str],
+    ):
+        """
+        Remove partition keys from filters.
+
+        :param filters: pandas-style filters
+            Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+        :param partition_keys: partition columns handled via directory
+
+        :return list of list of tuples: cleaned filters without partition keys
+        """
+        cleaned_filters = []
+        for group in filters:
+            new_group = [f for f in group if f[0] not in partition_keys]
+            if new_group:
+                cleaned_filters.append(new_group)
+        return cleaned_filters
+
+    @staticmethod
+    def _read_partitioned_parquet(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_keys: list[str],
+        df_module: ModuleType,
+        filesystem: fsspec.AbstractFileSystem,
+        **kwargs,
+    ):
+        """
+        Reads only the relevant partitions and concatenates the results.
+        Note that partition_keys cannot be empty.
+        """
+        logger.debug(f"Starting partition discovery process for {base_url}")
+
+        paths = DataStore._list_partitioned_paths(
+            base_url,
+            start_time,
+            end_time,
+            partition_keys[-1],
+            filesystem,
+        )
+
+        dfs = []
+        for current_path in paths:
+            try:
+                kwargs["filters"] = DataStore._clean_filters_for_partitions(
+                    kwargs["filters"], partition_keys
+                )
+                df = df_module.read_parquet(current_path, **kwargs)
+                logger.debug(
+                    "Finished reading DataFrame from subpath",
+                    url=current_path,
+                )
+                dfs.append(df)
+            except FileNotFoundError as e:
+                # Skip partitions that don't exist or have no data
+                logger.warning(
+                    "Failed to read DataFrame", url=current_path, exception=e
+                )
+
+        final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+        logger.debug(
+            "Finished reading partitioned parquet files",
+            url=base_url,
+            columns=final_df.columns,
+        )
+        return final_df
+
     @staticmethod
     def _parquet_reader(
         df_module,
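For intuition, the new _is_directory_in_range check keeps a hive-style partition directory only when its time span overlaps the requested window. A minimal sketch (times are illustrative, not taken from the diff):

    import datetime
    from mlrun.datastore.base import DataStore

    utc = datetime.timezone.utc
    start = datetime.datetime(2025, 11, 3, 12, tzinfo=utc)
    end = datetime.datetime(2025, 11, 4, 12, tzinfo=utc)

    # day=3 covers 2025-11-03 00:00:00 .. 23:59:59.999999 and overlaps the window
    assert DataStore._is_directory_in_range(start, end, year=2025, month=11, day=3)
    # day=2 ends before start_time, so that whole directory is pruned
    assert not DataStore._is_directory_in_range(start, end, year=2025, month=11, day=2)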
@@ -165,6 +358,7 @@ class DataStore(BaseRemoteClient):
         start_time,
         end_time,
         additional_filters,
+        optimize_discovery,
     ):
         from storey.utils import find_filters, find_partitions
 
@@ -203,7 +397,10 @@
            )
 
            if start_time or end_time or additional_filters:
-                partitions_time_attributes = find_partitions(url, file_system)
+                partitions_time_attributes, partitions = find_partitions(
+                    url, file_system
+                )
+                logger.debug("Partitioned parquet read", partitions=partitions)
                set_filters(
                    partitions_time_attributes,
                    start_time,
@@ -211,8 +408,28 @@
                    additional_filters,
                    kwargs,
                )
+
                try:
-                    return df_module.read_parquet(*args, **kwargs)
+                    if (
+                        optimize_discovery
+                        and partitions_time_attributes
+                        and DataStore._verify_path_partition_level(
+                            urlparse(url).path, partitions
+                        )
+                        and (start_time or end_time)
+                    ):
+                        return DataStore._read_partitioned_parquet(
+                            url,
+                            start_time,
+                            end_time,
+                            partitions_time_attributes,
+                            df_module,
+                            file_system,
+                            **kwargs,
+                        )
+
+                    else:
+                        return df_module.read_parquet(*args, **kwargs)
                except pyarrow.lib.ArrowInvalid as ex:
                    if not str(ex).startswith(
                        "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +455,24 @@
                        additional_filters,
                        kwargs,
                    )
-                    return df_module.read_parquet(*args, **kwargs)
+                    if (
+                        optimize_discovery
+                        and partitions_time_attributes
+                        and DataStore._verify_path_partition_level(
+                            urlparse(url).path, partitions
+                        )
+                    ):
+                        return DataStore._read_partitioned_parquet(
+                            url,
+                            start_time_inner,
+                            end_time_inner,
+                            partitions_time_attributes,
+                            df_module,
+                            file_system,
+                            **kwargs,
+                        )
+                    else:
+                        return df_module.read_parquet(*args, **kwargs)
            else:
                return df_module.read_parquet(*args, **kwargs)
 
@@ -261,6 +495,10 @@
        file_url = self._sanitize_url(url)
        is_csv, is_json, drop_time_column = False, False, False
        file_system = self.filesystem
+
+        # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+        optimize_discovery = kwargs.pop("optimize_discovery", True)
+
        if file_url.endswith(".csv") or format == "csv":
            is_csv = True
            drop_time_column = False
@@ -322,6 +560,7 @@
                start_time,
                end_time,
                additional_filters,
+                optimize_discovery,
            )
 
        elif file_url.endswith(".json") or format == "json":
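The optimize_discovery flag arrives through the reader kwargs and defaults to True, so pruned reads happen automatically whenever time filters are present and the layout check passes. A hedged sketch of opting out (the exact as_df plumbing is assumed, not shown in this diff):

    # Hypothetical call site: extra kwargs flow down to the parquet reader,
    # where "optimize_discovery" is popped (see the kwargs.pop above).
    df = data_item.as_df(
        start_time=start,
        end_time=end,
        optimize_discovery=False,  # fall back to plain read_parquet over the base url
    )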
@@ -347,7 +586,7 @@
            temp_file = tempfile.NamedTemporaryFile(delete=False)
            self.download(self._join(subpath), temp_file.name)
            df = reader(temp_file.name, **kwargs)
-            remove(temp_file.name)
+            os.remove(temp_file.name)
 
        if is_json or is_csv:
            # for parquet file the time filtering is executed in `reader`
@@ -387,6 +626,26 @@
        except ImportError:
            return False
 
+    @staticmethod
+    def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+        if not partitions:
+            return False
+
+        path_parts = base_path.strip("/").split("/")
+        path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+        if "hour" in partitions:
+            hour_index = partitions.index("hour")
+        else:
+            return False
+        for i, part in enumerate(partitions):
+            if not (
+                part in path_parts
+                or part in ["year", "month", "day", "hour"]
+                or i > hour_index
+            ):
+                return False
+        return True
+
 
 class DataItem:
     """Data input/output class abstracting access to various local/remote data sources
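_verify_path_partition_level gates the optimization: every partition column must appear in the base path (e.g. key=a), be one of the standard time units, or come after "hour", and layouts without an "hour" level are rejected outright. For example:

    from mlrun.datastore.base import DataStore

    partitions = ["key", "year", "month", "day", "hour"]
    DataStore._verify_path_partition_level("/sets/my-set/key=a", partitions)  # True
    DataStore._verify_path_partition_level("/sets/my-set", ["year", "month", "day"])  # False: no "hour" level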
@@ -439,7 +698,7 @@
     @property
     def suffix(self):
         """DataItem suffix (file extension) e.g. '.png'"""
-        _, file_ext = path.splitext(self._path)
+        _, file_ext = os.path.splitext(self._path)
         return file_ext
 
     @property
@@ -548,7 +807,7 @@
            return
 
        if self._local_path:
-            remove(self._local_path)
+            os.remove(self._local_path)
            self._local_path = ""
 
    def as_df(
@@ -648,8 +907,10 @@ def basic_auth_header(user, password):
    username = user.encode("latin1")
    password = password.encode("latin1")
    base = b64encode(b":".join((username, password))).strip()
-    authstr = "Basic " + base.decode("ascii")
-    return {"Authorization": authstr}
+    authstr = mlrun.common.schemas.AuthorizationHeaderPrefixes.basic + base.decode(
+        "ascii"
+    )
+    return {mlrun.common.schemas.HeaderNames.authorization: authstr}
 
 
 class HttpStore(DataStore):
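Assuming the new schema constants render to the standard header strings ("Authorization" and the "Basic " prefix), the produced header is unchanged; a quick check:

    from mlrun.datastore.base import basic_auth_header

    # b64("user:pass") == "dXNlcjpwYXNz"
    assert basic_auth_header("user", "pass") == {"Authorization": "Basic dXNlcjpwYXNz"}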
@@ -696,7 +957,10 @@ class HttpStore(DataStore):
        token = self._get_secret_or_env("HTTPS_AUTH_TOKEN")
        if token:
            self._https_auth_token = token
-            self._headers.setdefault(
+            self._headers.setdefault(
+                mlrun.common.schemas.HeaderNames.authorization,
+                f"{mlrun.common.schemas.AuthorizationHeaderPrefixes.bearer}{token}",
+            )
 
    def _validate_https_token(self):
        if self._https_auth_token and self._schema in ["http"]:
mlrun/datastore/datastore.py
CHANGED
@@ -47,7 +47,7 @@ from .v3io import V3ioStore
 in_memory_store = InMemoryStore()
 
 
-def schema_to_store(schema) -> DataStore:
+def schema_to_store(schema) -> type[DataStore]:
     # import store classes inside to enable making their dependencies optional (package extras)
 
     if not schema or schema in get_local_file_schema():
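The sharpened annotation makes explicit that schema_to_store returns the datastore class, not an instance:

    from mlrun.datastore.datastore import schema_to_store

    store_cls = schema_to_store("s3")  # e.g. the S3Store class itself
    # instantiation happens elsewhere, with the store's constructor arguments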
mlrun/datastore/datastore_profile.py
CHANGED
@@ -16,7 +16,7 @@ import ast
 import base64
 import json
 import typing
-from urllib.parse import ParseResult, urlparse
+from urllib.parse import ParseResult, quote, unquote, urlparse
 
 import pydantic.v1
 from deprecated import deprecated
@@ -283,8 +283,9 @@ class DatastoreProfileRedis(DatastoreProfile):
 
    def url_with_credentials(self):
        parsed_url = urlparse(self.endpoint_url)
-        username = self.username
-        password = self.password
+        # URL-encode username and password to handle special characters like @, :, /
+        username = quote(self.username, safe="") if self.username else None
+        password = quote(self.password, safe="") if self.password else None
        netloc = parsed_url.hostname
        if username:
            if password:
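With the Redis change, reserved characters in credentials no longer corrupt the generated URL; quoting with safe="" escapes everything:

    from urllib.parse import quote

    quote("p@ss:word", safe="")  # -> "p%40ss%3Aword", safe to embed as redis://user:password@host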
@@ -448,40 +449,71 @@ class DatastoreProfileHdfs(DatastoreProfile):
        return f"webhdfs://{self.host}:{self.http_port}{subpath}"
 
 
-class DatastoreProfileTDEngine(DatastoreProfile):
+class DatastoreProfilePostgreSQL(DatastoreProfile):
     """
-    A profile that holds the required parameters for a TDEngine database.
-
+    A profile that holds the required parameters for a PostgreSQL database.
+    PostgreSQL uses standard PostgreSQL connection parameters.
     """
 
-    type: str = pydantic.v1.Field("taosws")
+    type: str = pydantic.v1.Field("postgresql")
    _private_attributes = ["password"]
    user: str
    # The password cannot be empty in real world scenarios. It's here just because of the profiles completion design.
    password: typing.Optional[str]
    host: str
    port: int
+    database: str = "postgres"  # Default PostgreSQL admin database
 
-    def dsn(self) -> str:
-        """Get the DSN of the TDEngine profile."""
-        return f"taosws://{self.user}:{self.password}@{self.host}:{self.port}"
+    def dsn(self, database: typing.Optional[str] = None) -> str:
+        """
+        Get the Data Source Name of the configured PostgreSQL profile.
+
+        :param database: Optional database name to use instead of the configured one.
+                         If None, uses the configured database.
+        :return: The DSN string.
+        """
+        db = database or self.database
+        # URL-encode credentials and database to handle special characters
+        user = quote(self.user, safe="")
+        password = quote(self.password or "", safe="")
+        db_encoded = quote(db, safe="")
+        return f"{self.type}://{user}:{password}@{self.host}:{self.port}/{db_encoded}"
+
+    def admin_dsn(self) -> str:
+        """
+        Get DSN for administrative operations using the 'postgres' database.
+
+        Assumes the default 'postgres' database exists (standard PostgreSQL setup).
+        Used for admin tasks like creating/dropping databases.
+
+        :return: DSN pointing to the 'postgres' database.
+        """
+        return self.dsn(database="postgres")
 
    @classmethod
-    def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfileTDEngine":
+    def from_dsn(cls, dsn: str, profile_name: str) -> "DatastoreProfilePostgreSQL":
        """
-        Construct a TDEngine profile from DSN (connection string) and a name for the profile.
+        Construct a PostgreSQL profile from DSN (connection string) and a name for the profile.
 
-        :param dsn: The DSN (Data Source Name) of the TDEngine database.
+        :param dsn: The DSN (Data Source Name) of the PostgreSQL database,
+                    e.g.: ``"postgresql://user:password@localhost:5432/mydb"``.
        :param profile_name: The new profile's name.
-        :return: The TDEngine profile.
+        :return: The PostgreSQL profile.
        """
        parsed_url = urlparse(dsn)
+        # URL-decode username, password, and database (urlparse doesn't decode them)
+        username = unquote(parsed_url.username) if parsed_url.username else None
+        password = unquote(parsed_url.password) if parsed_url.password else None
+        database = (
+            unquote(parsed_url.path.lstrip("/")) if parsed_url.path else "postgres"
+        )
        return cls(
            name=profile_name,
-            user=parsed_url.username,
-            password=parsed_url.password,
+            user=username,
+            password=password,
            host=parsed_url.hostname,
            port=parsed_url.port,
+            database=database or "postgres",
        )
 
 
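The quote/unquote pair keeps credentials with reserved characters round-trippable through the DSN. A sketch with the new profile (values illustrative):

    profile = DatastoreProfilePostgreSQL(
        name="pg",
        user="admin@corp",
        password="p:ss/w@rd",
        host="localhost",
        port=5432,
        database="mlrun",
    )
    profile.dsn()
    # -> "postgresql://admin%40corp:p%3Ass%2Fw%40rd@localhost:5432/mlrun"
    profile.admin_dsn()
    # -> same credentials, but the "postgres" database, for admin operations

    # from_dsn() unquotes, so the raw values survive the round trip:
    DatastoreProfilePostgreSQL.from_dsn(profile.dsn(), "pg2").password  # "p:ss/w@rd"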
@@ -552,7 +584,7 @@ _DATASTORE_TYPE_TO_PROFILE_CLASS: dict[str, type[DatastoreProfile]] = {
    "gcs": DatastoreProfileGCS,
    "az": DatastoreProfileAzureBlob,
    "hdfs": DatastoreProfileHdfs,
-    "taosws": DatastoreProfileTDEngine,
+    "postgresql": DatastoreProfilePostgreSQL,
    "config": ConfigProfile,
    "openai": OpenAIProfile,
    "huggingface": HuggingFaceProfile,
mlrun/datastore/model_provider/huggingface_provider.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import threading
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 import mlrun
@@ -41,6 +41,9 @@ class HuggingFaceProvider(ModelProvider):
    into memory for inference. Ensure you have the required CPU/GPU and memory to use this operation.
    """
 
+    # locks for threading use cases
+    _client_lock = threading.Lock()
+
    def __init__(
        self,
        parent,
@@ -224,7 +227,8 @@ class HuggingFaceProvider(ModelProvider):
 
            self.options["model_kwargs"] = self.options.get("model_kwargs", {})
            self.options["model_kwargs"]["local_files_only"] = True
-            self._client = pipeline(model=self.model, **self.options)
+            with self._client_lock:
+                self._client = pipeline(model=self.model, **self.options)
            self._expected_operation_type = Pipeline
        except ImportError as exc:
            raise ImportError("transformers package is not installed") from exc
mlrun/datastore/model_provider/model_provider.py
CHANGED
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from collections.abc import Awaitable
-from typing import Any, Callable, Optional, Union
+from collections.abc import Awaitable, Callable
+from typing import Any, Optional, Union
 
 import mlrun.errors
 from mlrun.common.types import StrEnum
mlrun/datastore/model_provider/openai_provider.py
CHANGED
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import inspect
-from collections.abc import Awaitable
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from collections.abc import Awaitable, Callable
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 import mlrun
 from mlrun.datastore.model_provider.model_provider import (
mlrun/datastore/s3.py
CHANGED
@@ -18,12 +18,16 @@ from typing import Optional
 from urllib.parse import urlparse
 
 import boto3
+import botocore.exceptions
 from boto3.s3.transfer import TransferConfig
 from fsspec.registry import get_filesystem_class
 
 import mlrun.errors
 
 from .base import DataStore, FileStats, make_datastore_schema_sanitizer
+from .utils import parse_s3_bucket_and_key
+
+__all__ = ["parse_s3_bucket_and_key"]
 
 
 class S3Store(DataStore):
@@ -225,9 +229,17 @@ class S3Store(DataStore):
    def get(self, key, size=None, offset=0):
        bucket, key = self.get_bucket_and_key(key)
        obj = self.s3.Object(bucket, key)
-        if size or offset:
-            return obj.get(Range=S3Store.get_range(size, offset))["Body"].read()
-        return obj.get()["Body"].read()
+        try:
+            if size or offset:
+                return obj.get(Range=S3Store.get_range(size, offset))["Body"].read()
+            return obj.get()["Body"].read()
+
+        except botocore.exceptions.ClientError as exc:
+            if exc.response["Error"]["Code"] == "NoSuchKey":
+                # "NoSuchKey" errors codes - equivalent to `FileNotFoundError`
+                raise FileNotFoundError(f"s3://{bucket}/{key}") from exc
+            # Other errors are raised as-is
+            raise
 
    def put(self, key, data, append=False):
        data, _ = self._prepare_put_data(data, append)
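A missing object now surfaces as the standard FileNotFoundError rather than a raw botocore ClientError, so callers can use ordinary exception handling:

    try:
        data = store.get("path/inside/bucket/missing.parquet")  # store: an S3Store
    except FileNotFoundError:
        data = None  # a missing key is now a normal "file not found" condition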
@@ -259,16 +271,3 @@
        # In order to raise an error if there is connection error, ML-7056.
        self.filesystem.exists(path=path)
        self.filesystem.rm(path=path, recursive=recursive, maxdepth=maxdepth)
-
-
-def parse_s3_bucket_and_key(s3_path):
-    try:
-        path_parts = s3_path.replace("s3://", "").split("/")
-        bucket = path_parts.pop(0)
-        key = "/".join(path_parts)
-    except Exception as exc:
-        raise mlrun.errors.MLRunInvalidArgumentError(
-            "failed to parse s3 bucket and key"
-        ) from exc
-
-    return bucket, key
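parse_s3_bucket_and_key is relocated to mlrun/datastore/utils.py (and re-exported from mlrun.datastore.s3 via __all__), not changed; per the removed body above:

    from mlrun.datastore.s3 import parse_s3_bucket_and_key  # re-exported from .utils

    parse_s3_bucket_and_key("s3://my-bucket/a/b.parquet")  # -> ("my-bucket", "a/b.parquet")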
mlrun/datastore/sources.py
CHANGED
@@ -460,7 +460,7 @@ class ParquetSource(BaseSourceDriver):
            if not filter_tuple:
                continue
            col_name, op, value = filter_tuple
-            if op.lower() in ("in", "not in") and isinstance(value, (list, tuple, set)):
+            if op.lower() in ("in", "not in") and isinstance(value, list | tuple | set):
                none_exists = False
                value = list(value)
                for sub_value in value:
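The rewritten check uses PEP 604 union syntax, which isinstance accepts natively on Python 3.10+ and which is equivalent to the tuple form:

    isinstance((1, 2), list | tuple | set)   # True on Python >= 3.10
    isinstance((1, 2), (list, tuple, set))   # equivalent tuple form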
mlrun/datastore/store_resources.py
CHANGED
@@ -76,9 +76,9 @@ class ResourceCache:
            return self._tabels[uri]
 
        if uri.startswith("v3io://") or uri.startswith("v3ios://"):
-            endpoint, uri = parse_path(uri)
+            endpoint, path = parse_path(uri)
            self._tabels[uri] = Table(
-                uri,
+                path,
                V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
                flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
            )
@@ -87,10 +87,10 @@
        if uri.startswith("redis://") or uri.startswith("rediss://"):
            from storey.redis_driver import RedisDriver
 
-            endpoint, uri = parse_path(uri)
+            endpoint, path = parse_path(uri)
            endpoint = endpoint or mlrun.mlconf.redis.url
            self._tabels[uri] = Table(
-                uri,
+                path,
                RedisDriver(redis_url=endpoint, key_prefix="/"),
                flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
            )