mlrun 1.10.0rc42__py3-none-any.whl → 1.10.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/config.py +3 -12
- mlrun/datastore/base.py +265 -7
- mlrun/datastore/datastore.py +1 -1
- mlrun/datastore/model_provider/huggingface_provider.py +6 -2
- mlrun/datastore/store_resources.py +4 -4
- mlrun/model_monitoring/applications/base.py +16 -2
- mlrun/projects/operations.py +10 -10
- mlrun/projects/project.py +34 -29
- mlrun/run.py +3 -3
- mlrun/runtimes/nuclio/function.py +4 -2
- mlrun/runtimes/nuclio/serving.py +17 -16
- mlrun/serving/server.py +41 -22
- mlrun/serving/states.py +70 -77
- mlrun/utils/helpers.py +3 -1
- mlrun/utils/notifications/notification/mail.py +38 -15
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/METADATA +9 -7
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/RECORD +22 -22
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc42.dist-info → mlrun-1.10.1rc4.dist-info}/top_level.txt +0 -0
mlrun/config.py
CHANGED
@@ -66,7 +66,6 @@ default_config = {
     "nuclio_version": "",
     "default_nuclio_runtime": "python:3.11",
     "nest_asyncio_enabled": "", # enable import of nest_asyncio for corner cases with old jupyter, set "1"
-    "ui_url": "", # remote/external mlrun UI url (for hyperlinks) (This is deprecated in favor of the ui block)
     "remote_host": "",
     "api_base_version": "v1",
     "version": "", # will be set to current version
@@ -304,7 +303,7 @@ default_config = {
     "application": {
         "default_sidecar_internal_port": 8050,
         "default_authentication_mode": mlrun.common.schemas.APIGatewayAuthenticationMode.none,
-        "default_worker_number":
+        "default_worker_number": 100,
     },
 },
 # TODO: function defaults should be moved to the function spec config above
@@ -725,7 +724,7 @@ default_config = {
     # Set false to avoid creating a global source (for example in a dark site)
     "create": True,
     "name": "default",
-    "description": "MLRun
+    "description": "MLRun hub",
     "url": "https://mlrun.github.io/marketplace",
     "channel": "master",
 },
@@ -1280,10 +1279,7 @@ class Config:

     @staticmethod
     def resolve_ui_url():
-
-        # since the config class is used in a "recursive" way, we can't use property like we used in other places
-        # since the property will need to be url, which exists in other structs as well
-        return config.ui.url or config.ui_url
+        return config.ui.url

     def is_api_running_on_k8s(self):
         # determine if the API service is attached to K8s cluster
@@ -1570,7 +1566,6 @@ def read_env(env=None, prefix=env_prefix):
         "https://mlrun-api.", "https://framesd."
     )

-    uisvc = env.get("MLRUN_UI_SERVICE_HOST")
     igz_domain = env.get("IGZ_NAMESPACE_DOMAIN")

     # workaround to try and detect IGZ domain
@@ -1596,10 +1591,6 @@ def read_env(env=None, prefix=env_prefix):
     if config.get("nuclio_dashboard_url") == "disabled":
         config["nuclio_dashboard_url"] = ""

-    if uisvc and not config.get("ui_url"):
-        if igz_domain:
-            config["ui_url"] = f"https://mlrun-ui.{igz_domain}"
-
     if log_level := config.get("log_level"):
         import mlrun.utils.logger
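With the deprecated top-level ui_url key and the MLRUN_UI_SERVICE_HOST fallback removed, the UI address is now resolved only from the ui block. A minimal sketch of setting and resolving it on the client side (the URL value is a placeholder, not a real endpoint):

import mlrun

# configure the ui block instead of the removed top-level ui_url key
mlrun.mlconf.ui.url = "https://mlrun-ui.example.com"
print(mlrun.mlconf.resolve_ui_url())  # -> https://mlrun-ui.example.com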
mlrun/datastore/base.py
CHANGED
@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import datetime
+import os
+import os.path
 import tempfile
 import urllib.parse
 from base64 import b64encode
 from copy import copy
-from
+from types import ModuleType
 from typing import Optional, Union
 from urllib.parse import urlparse

@@ -156,6 +159,195 @@ class DataStore(BaseRemoteClient):
     def get_spark_options(self, path=None):
         return {}

+    @staticmethod
+    def _is_directory_in_range(
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        year: int,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        **kwargs,
+    ):
+        """Check if a partition directory (year=.., month=.., etc.) is in the time range."""
+        from dateutil.relativedelta import relativedelta
+
+        partition_start = datetime.datetime(
+            year=year,
+            month=month or 1,
+            day=day or 1,
+            hour=hour or 0,
+            tzinfo=start_time.tzinfo if start_time else end_time.tzinfo,
+        )
+        partition_end = (
+            partition_start
+            + relativedelta(
+                years=1 if month is None else 0,
+                months=1 if day is None and month is not None else 0,
+                days=1 if hour is None and day is not None else 0,
+                hours=1 if hour is not None else 0,
+            )
+            - datetime.timedelta(microseconds=1)
+        )
+
+        if (end_time and end_time < partition_start) or (
+            start_time and start_time > partition_end
+        ):
+            return False
+        return True
+
+    @staticmethod
+    def _list_partition_paths_helper(
+        paths: list[str],
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        current_path: str,
+        partition_level: str,
+        filesystem,
+    ):
+        directory_split = current_path.rsplit("/", 1)
+        time_unit = None
+        directory_start, directory_end = "", ""
+        if len(directory_split) == 2:
+            directory_start, directory_end = directory_split
+            time_unit = directory_end.split("=")[0] if "=" in directory_end else None
+
+        if not time_unit and directory_end.endswith((".parquet", ".pq")):
+            paths.append(directory_start.rstrip("/"))
+            return
+        elif time_unit and time_unit == partition_level:
+            paths.append(current_path.rstrip("/"))
+            return
+
+        directories = filesystem.ls(current_path, detail=True)
+        if len(directories) == 0:
+            return
+        for directory in directories:
+            current_path = directory["name"]
+            parts = [p for p in current_path.split("/") if "=" in p]
+            kwargs = {}
+            for part in parts:
+                key, value = part.split("=", 1)
+                if value.isdigit():
+                    value = int(value)
+                kwargs[key] = value
+            if DataStore._is_directory_in_range(start_time, end_time, **kwargs):
+                DataStore._list_partition_paths_helper(
+                    paths,
+                    start_time,
+                    end_time,
+                    current_path,
+                    partition_level,
+                    filesystem,
+                )
+
+    @staticmethod
+    def _list_partitioned_paths(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_level: str,
+        filesystem,
+    ):
+        paths = []
+        parsed_base_url = urlparse(base_url)
+        base_path = parsed_base_url.path
+
+        if parsed_base_url.scheme not in ["v3io", "v3ios"]:
+            base_path = parsed_base_url.netloc + base_path
+
+        DataStore._list_partition_paths_helper(
+            paths, start_time, end_time, base_path, partition_level, filesystem
+        )
+        paths = [
+            DataStore._reconstruct_path_from_base_url(parsed_base_url, path)
+            for path in paths
+        ]
+        return paths
+
+    @staticmethod
+    def _reconstruct_path_from_base_url(
+        parsed_base_url: urllib.parse.ParseResult, returned_path: str
+    ) -> str:
+        scheme = parsed_base_url.scheme
+        authority = parsed_base_url.netloc
+        returned_path = returned_path.lstrip("/")
+        if scheme == "v3io":
+            return f"{scheme}://{authority}/{returned_path}"
+        else:
+            return f"{scheme}://{returned_path}"
+
+    @staticmethod
+    def _clean_filters_for_partitions(
+        filters: list[list[tuple]],
+        partition_keys: list[str],
+    ):
+        """
+        Remove partition keys from filters.
+
+        :param filters: pandas-style filters
+            Example: [[('year','=',2025),('month','=',11),('timestamp','>',ts1)]]
+        :param partition_keys: partition columns handled via directory
+
+        :return list of list of tuples: cleaned filters without partition keys
+        """
+        cleaned_filters = []
+        for group in filters:
+            new_group = [f for f in group if f[0] not in partition_keys]
+            if new_group:
+                cleaned_filters.append(new_group)
+        return cleaned_filters
+
+    @staticmethod
+    def _read_partitioned_parquet(
+        base_url: str,
+        start_time: Optional[datetime.datetime],
+        end_time: Optional[datetime.datetime],
+        partition_keys: list[str],
+        df_module: ModuleType,
+        filesystem: fsspec.AbstractFileSystem,
+        **kwargs,
+    ):
+        """
+        Reads only the relevant partitions and concatenates the results.
+        Note that partition_keys cannot be empty.
+        """
+        logger.debug(f"Starting partition discovery process for {base_url}")
+
+        paths = DataStore._list_partitioned_paths(
+            base_url,
+            start_time,
+            end_time,
+            partition_keys[-1],
+            filesystem,
+        )
+
+        dfs = []
+        for current_path in paths:
+            try:
+                kwargs["filters"] = DataStore._clean_filters_for_partitions(
+                    kwargs["filters"], partition_keys
+                )
+                df = df_module.read_parquet(current_path, **kwargs)
+                logger.debug(
+                    "Finished reading DataFrame from subpath",
+                    url=current_path,
+                )
+                dfs.append(df)
+            except FileNotFoundError as e:
+                # Skip partitions that don't exist or have no data
+                logger.warning(
+                    "Failed to read DataFrame", url=current_path, exception=e
+                )
+
+        final_df = pd.concat(dfs) if dfs else pd.DataFrame()
+        logger.debug(
+            "Finished reading partitioned parquet files",
+            url=base_url,
+            columns=final_df.columns,
+        )
+        return final_df
+
     @staticmethod
     def _parquet_reader(
         df_module,
@@ -165,6 +357,7 @@ class DataStore(BaseRemoteClient):
         start_time,
         end_time,
         additional_filters,
+        optimize_discovery,
     ):
         from storey.utils import find_filters, find_partitions

@@ -203,7 +396,10 @@ class DataStore(BaseRemoteClient):
         )

         if start_time or end_time or additional_filters:
-            partitions_time_attributes = find_partitions(
+            partitions_time_attributes, partitions = find_partitions(
+                url, file_system, True
+            )
+            logger.debug("Partitioned parquet read", partitions=partitions)
             set_filters(
                 partitions_time_attributes,
                 start_time,
@@ -211,8 +407,28 @@ class DataStore(BaseRemoteClient):
                 additional_filters,
                 kwargs,
             )
+
             try:
-
+                if (
+                    optimize_discovery
+                    and partitions_time_attributes
+                    and DataStore._verify_path_partition_level(
+                        urlparse(url).path, partitions
+                    )
+                    and (start_time or end_time)
+                ):
+                    return DataStore._read_partitioned_parquet(
+                        url,
+                        start_time,
+                        end_time,
+                        partitions_time_attributes,
+                        df_module,
+                        file_system,
+                        **kwargs,
+                    )
+
+                else:
+                    return df_module.read_parquet(*args, **kwargs)
             except pyarrow.lib.ArrowInvalid as ex:
                 if not str(ex).startswith(
                     "Cannot compare timestamp with timezone to timestamp without timezone"
@@ -238,7 +454,24 @@ class DataStore(BaseRemoteClient):
                     additional_filters,
                     kwargs,
                 )
-
+                if (
+                    optimize_discovery
+                    and partitions_time_attributes
+                    and DataStore._verify_path_partition_level(
+                        urlparse(url).path, partitions
+                    )
+                ):
+                    return DataStore._read_partitioned_parquet(
+                        url,
+                        start_time_inner,
+                        end_time_inner,
+                        partitions_time_attributes,
+                        df_module,
+                        file_system,
+                        **kwargs,
+                    )
+                else:
+                    return df_module.read_parquet(*args, **kwargs)
             else:
                 return df_module.read_parquet(*args, **kwargs)

@@ -261,6 +494,10 @@ class DataStore(BaseRemoteClient):
         file_url = self._sanitize_url(url)
         is_csv, is_json, drop_time_column = False, False, False
         file_system = self.filesystem
+
+        # Feature flag optimize partition discovery by providing specific partition levels urls to the parquet reader
+        optimize_discovery = kwargs.pop("optimize_discovery", True)
+
         if file_url.endswith(".csv") or format == "csv":
             is_csv = True
             drop_time_column = False
@@ -322,6 +559,7 @@ class DataStore(BaseRemoteClient):
                 start_time,
                 end_time,
                 additional_filters,
+                optimize_discovery,
             )

         elif file_url.endswith(".json") or format == "json":
@@ -347,7 +585,7 @@ class DataStore(BaseRemoteClient):
         temp_file = tempfile.NamedTemporaryFile(delete=False)
         self.download(self._join(subpath), temp_file.name)
         df = reader(temp_file.name, **kwargs)
-        remove(temp_file.name)
+        os.remove(temp_file.name)

         if is_json or is_csv:
             # for parquet file the time filtering is executed in `reader`
@@ -387,6 +625,26 @@ class DataStore(BaseRemoteClient):
         except ImportError:
             return False

+    @staticmethod
+    def _verify_path_partition_level(base_path: str, partitions: list[str]) -> bool:
+        if not partitions:
+            return False
+
+        path_parts = base_path.strip("/").split("/")
+        path_parts = [part.split("=")[0] for part in path_parts if "=" in part]
+        if "hour" in partitions:
+            hour_index = partitions.index("hour")
+        else:
+            return False
+        for i, part in enumerate(partitions):
+            if not (
+                part in path_parts
+                or part in ["year", "month", "day", "hour"]
+                or i > hour_index
+            ):
+                return False
+        return True
+

 class DataItem:
     """Data input/output class abstracting access to various local/remote data sources
@@ -439,7 +697,7 @@ class DataItem:
     @property
     def suffix(self):
         """DataItem suffix (file extension) e.g. '.png'"""
-        _, file_ext = path.splitext(self._path)
+        _, file_ext = os.path.splitext(self._path)
        return file_ext

     @property
@@ -548,7 +806,7 @@ class DataItem:
            return

        if self._local_path:
-           remove(self._local_path)
+           os.remove(self._local_path)
            self._local_path = ""

    def as_df(
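The new partition-pruning helpers can be exercised on their own to see how time ranges and pandas-style filters interact. A minimal sketch, assuming the static helpers land exactly as in the hunks above (the timestamps and filter list are made-up sample values):

import datetime

from mlrun.datastore.base import DataStore

start = datetime.datetime(2025, 11, 1, tzinfo=datetime.timezone.utc)
end = datetime.datetime(2025, 11, 30, tzinfo=datetime.timezone.utc)

# a year=2025/month=11 directory overlaps the requested window, month=12 does not
assert DataStore._is_directory_in_range(start, end, year=2025, month=11)
assert not DataStore._is_directory_in_range(start, end, year=2025, month=12)

# partition columns are already encoded in the directory paths that were kept,
# so they are stripped from the filters passed to each per-partition read_parquet call
filters = [[("year", "=", 2025), ("month", "=", 11), ("timestamp", ">", start)]]
print(DataStore._clean_filters_for_partitions(filters, ["year", "month"]))
# -> [[('timestamp', '>', datetime.datetime(2025, 11, 1, ...))]]

Callers that hit an edge case can opt out of the optimized path per read by passing optimize_discovery=False through the reader kwargs, since the reader pops the flag with a default of True.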
mlrun/datastore/datastore.py
CHANGED
@@ -47,7 +47,7 @@ from .v3io import V3ioStore
 in_memory_store = InMemoryStore()


-def schema_to_store(schema) -> DataStore
+def schema_to_store(schema) -> type[DataStore]:
     # import store classes inside to enable making their dependencies optional (package extras)

     if not schema or schema in get_local_file_schema():
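The corrected annotation makes explicit that schema_to_store() returns a DataStore subclass rather than an instance. A small illustrative sketch (an empty scheme resolves to the local-file branch shown in the hunk; the printed class name depends on the installed version):

from mlrun.datastore.datastore import schema_to_store

store_cls = schema_to_store("")   # a class object, not an instance
print(isinstance(store_cls, type), store_cls.__name__)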
mlrun/datastore/model_provider/huggingface_provider.py
CHANGED

@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import threading
 from typing import TYPE_CHECKING, Any, Optional, Union

 import mlrun
@@ -41,6 +41,9 @@ class HuggingFaceProvider(ModelProvider):
     into memory for inference. Ensure you have the required CPU/GPU and memory to use this operation.
     """

+    # locks for threading use cases
+    _client_lock = threading.Lock()
+
     def __init__(
         self,
         parent,
@@ -224,7 +227,8 @@ class HuggingFaceProvider(ModelProvider):

             self.options["model_kwargs"] = self.options.get("model_kwargs", {})
             self.options["model_kwargs"]["local_files_only"] = True
-
+            with self._client_lock:
+                self._client = pipeline(model=self.model, **self.options)
             self._expected_operation_type = Pipeline
         except ImportError as exc:
             raise ImportError("transformers package is not installed") from exc
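The class-level lock serializes construction of the transformers pipeline when several threads trigger model loading at once. The same pattern in isolation, as a hedged sketch (the names below are illustrative, not part of the provider's API):

import threading

class LazyClient:
    # one lock shared by all instances, mirroring HuggingFaceProvider._client_lock
    _client_lock = threading.Lock()

    def __init__(self, factory):
        self._factory = factory
        self._client = None

    def load(self):
        # only one thread at a time builds the underlying client object
        with self._client_lock:
            if self._client is None:
                self._client = self._factory()
        return self._client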
mlrun/datastore/store_resources.py
CHANGED

@@ -76,9 +76,9 @@ class ResourceCache:
             return self._tabels[uri]

         if uri.startswith("v3io://") or uri.startswith("v3ios://"):
-            endpoint,
+            endpoint, path = parse_path(uri)
             self._tabels[uri] = Table(
-
+                path,
                 V3ioDriver(webapi=endpoint or mlrun.mlconf.v3io_api),
                 flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
             )
@@ -87,10 +87,10 @@ class ResourceCache:
         if uri.startswith("redis://") or uri.startswith("rediss://"):
             from storey.redis_driver import RedisDriver

-            endpoint,
+            endpoint, path = parse_path(uri)
             endpoint = endpoint or mlrun.mlconf.redis.url
             self._tabels[uri] = Table(
-
+                path,
                 RedisDriver(redis_url=endpoint, key_prefix="/"),
                 flush_interval_secs=mlrun.mlconf.feature_store.flush_interval,
             )
mlrun/model_monitoring/applications/base.py
CHANGED

@@ -850,6 +850,11 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
     * ``base_period``, ``int``
     * ``write_output``, ``bool``
     * ``existing_data_handling``, ``str``
+    * ``_init_args``, ``dict`` - the arguments for the application class constructor
+      (equivalent to ``class_arguments``)
+
+    See :py:meth:`~ModelMonitoringApplicationBase.evaluate` for more details
+    about these inputs and params.

     For Git sources, add the source archive to the returned job and change the handler:

@@ -928,6 +933,7 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         image: Optional[str] = None,
         with_repo: Optional[bool] = False,
         class_handler: Optional[str] = None,
+        class_arguments: Optional[dict[str, Any]] = None,
         requirements: Optional[Union[str, list[str]]] = None,
         requirements_file: str = "",
         endpoints: Union[list[tuple[str, str]], list[str], Literal["all"], None] = None,
@@ -963,7 +969,10 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
                            You do not need to have a model endpoint to use this option.
     :param image:          Docker image to run the job on (when running remotely).
     :param with_repo:      Whether to clone the current repo to the build source.
-    :param class_handler:  The relative path to the class, useful when using Git sources or code
+    :param class_handler:  The relative path to the application class, useful when using Git sources or code
+                           from images.
+    :param class_arguments: The arguments for the application class constructor. These are passed to the
+                           class ``__init__``. The values must be JSON-serializable.
     :param requirements:   List of Python requirements to be installed in the image.
     :param requirements_file: Path to a Python requirements file to be installed in the image.
     :param endpoints:      The model endpoints to get the data from. The options are:
@@ -1041,7 +1050,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
             project=project,
         )

-        params: dict[
+        params: dict[
+            str, Union[list, dict, str, int, None, ds_profile.DatastoreProfile]
+        ] = {}
         if endpoints:
             params["endpoints"] = endpoints
         if sample_data is None:
@@ -1077,6 +1088,9 @@ class ModelMonitoringApplicationBase(MonitoringApplicationToDict, ABC):
         )
         params["stream_profile"] = stream_profile

+        if class_arguments:
+            params["_init_args"] = class_arguments
+
         inputs: dict[str, str] = {}
         for data, identifier in [
             (sample_data, "sample_data"),
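The new class_arguments parameter forwards constructor keyword arguments to the monitoring application class (surfaced to the job as the _init_args param). A hedged usage sketch: the application class, its threshold argument, and the do_tracking body are invented for illustration, and to_job is an assumed name for the job-building classmethod whose signature appears in the hunk above:

from mlrun.model_monitoring.applications import ModelMonitoringApplicationBase

class MyMonitoringApp(ModelMonitoringApplicationBase):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold  # must be JSON-serializable

    def do_tracking(self, monitoring_context):
        ...  # application logic goes here

# the dict is passed to MyMonitoringApp.__init__ via params["_init_args"]
job = MyMonitoringApp.to_job(
    class_arguments={"threshold": 0.9},
    image="mlrun/mlrun",
)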
mlrun/projects/operations.py
CHANGED
@@ -85,17 +85,17 @@ def run_function(
 ) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
     """Run a local or remote task as part of a local/kubeflow pipeline

-    run_function()
-    function can be specified as an object or by name (str)
-    in the current project eliminating the need to redefine/edit functions.
+    run_function() allows you to execute a function locally, on a remote cluster, or as part of an automated workflow.
+    The function can be specified as an object or by name (str). When the function is specified by name it is looked up
+    in the current project, eliminating the need to redefine/edit functions.

-
+    When functions run as part of a workflow/pipeline (project.run()) some attributes can be set at the run level,
     e.g. local=True will run all the functions locally, setting artifact_path will direct all outputs to the same path.
-
-
-
+    Project runs provide additional notifications/reporting and exception handling.
+    Inside a Kubeflow pipeline (KFP) run_function() generates KFP node (see PipelineNodeWrapper) which forms a DAG.
+    Some behavior may differ between regular runs and deferred KFP runs.

-
+    Example (use with function object)::

         LABELS = "is_error"
         MODEL_CLASS = "sklearn.ensemble.RandomForestClassifier"
@@ -107,7 +107,7 @@ def run_function(
             inputs={"dataset": DATA_PATH},
         )

-
+    Example (use with project)::

         # create a project with two functions (local and from hub)
         project = mlrun.new_project(project_name, "./proj)
@@ -119,7 +119,7 @@ def run_function(
         run2 = run_function("train", params={"label_columns": LABELS, "model_class": MODEL_CLASS},
                             inputs={"dataset": run1.outputs["data"]})

-
+    Example (use in pipeline)::

         @dsl.pipeline(name="test pipeline", description="test")
         def my_pipe(url=""):
|