ob-metaflow 2.12.30.2__py2.py3-none-any.whl → 2.13.6.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/__init__.py +3 -0
- metaflow/cards.py +1 -0
- metaflow/cli.py +185 -717
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +51 -0
- metaflow/cli_components/run_cmds.py +362 -0
- metaflow/cli_components/step_cmd.py +176 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/cmd/develop/stub_generator.py +9 -2
- metaflow/datastore/flow_datastore.py +2 -2
- metaflow/decorators.py +63 -2
- metaflow/exception.py +8 -2
- metaflow/extension_support/plugins.py +42 -27
- metaflow/flowspec.py +176 -23
- metaflow/graph.py +28 -27
- metaflow/includefile.py +50 -22
- metaflow/lint.py +35 -20
- metaflow/metadata_provider/heartbeat.py +23 -8
- metaflow/metaflow_config.py +10 -1
- metaflow/multicore_utils.py +31 -14
- metaflow/package.py +17 -3
- metaflow/parameters.py +97 -25
- metaflow/plugins/__init__.py +22 -0
- metaflow/plugins/airflow/airflow.py +18 -17
- metaflow/plugins/airflow/airflow_cli.py +1 -0
- metaflow/plugins/argo/argo_client.py +0 -2
- metaflow/plugins/argo/argo_workflows.py +195 -132
- metaflow/plugins/argo/argo_workflows_cli.py +1 -1
- metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +51 -9
- metaflow/plugins/argo/jobset_input_paths.py +0 -1
- metaflow/plugins/aws/aws_utils.py +6 -1
- metaflow/plugins/aws/batch/batch_client.py +1 -3
- metaflow/plugins/aws/batch/batch_decorator.py +13 -13
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +33 -1
- metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +7 -9
- metaflow/plugins/cards/card_cli.py +7 -2
- metaflow/plugins/cards/card_creator.py +1 -0
- metaflow/plugins/cards/card_decorator.py +79 -8
- metaflow/plugins/cards/card_modules/basic.py +56 -5
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/components.py +64 -16
- metaflow/plugins/cards/card_modules/main.js +27 -25
- metaflow/plugins/cards/card_modules/test_cards.py +4 -4
- metaflow/plugins/cards/component_serializer.py +1 -1
- metaflow/plugins/datatools/s3/s3.py +12 -4
- metaflow/plugins/datatools/s3/s3op.py +3 -3
- metaflow/plugins/events_decorator.py +338 -186
- metaflow/plugins/kubernetes/kube_utils.py +84 -1
- metaflow/plugins/kubernetes/kubernetes.py +40 -92
- metaflow/plugins/kubernetes/kubernetes_cli.py +32 -7
- metaflow/plugins/kubernetes/kubernetes_decorator.py +76 -4
- metaflow/plugins/kubernetes/kubernetes_job.py +23 -20
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +41 -20
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/parallel_decorator.py +4 -1
- metaflow/plugins/project_decorator.py +33 -5
- metaflow/plugins/pypi/bootstrap.py +249 -81
- metaflow/plugins/pypi/conda_decorator.py +20 -10
- metaflow/plugins/pypi/conda_environment.py +83 -27
- metaflow/plugins/pypi/micromamba.py +82 -37
- metaflow/plugins/pypi/pip.py +9 -6
- metaflow/plugins/pypi/pypi_decorator.py +11 -9
- metaflow/plugins/pypi/utils.py +4 -2
- metaflow/plugins/timeout_decorator.py +2 -2
- metaflow/runner/click_api.py +240 -50
- metaflow/runner/deployer.py +1 -1
- metaflow/runner/deployer_impl.py +12 -11
- metaflow/runner/metaflow_runner.py +68 -34
- metaflow/runner/nbdeploy.py +2 -0
- metaflow/runner/nbrun.py +1 -1
- metaflow/runner/subprocess_manager.py +61 -10
- metaflow/runner/utils.py +208 -44
- metaflow/runtime.py +216 -112
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/tracing/tracing_modules.py +4 -1
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_decorators.py +563 -0
- metaflow/user_configs/config_options.py +548 -0
- metaflow/user_configs/config_parameters.py +436 -0
- metaflow/util.py +22 -0
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/METADATA +12 -3
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/RECORD +96 -84
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/top_level.txt +0 -0
metaflow/runtime.py CHANGED

@@ -6,9 +6,12 @@ using local / remote processes
 """

 from __future__ import print_function
+import json
 import os
 import sys
 import fcntl
+import re
+import tempfile
 import time
 import subprocess
 from datetime import datetime
@@ -31,6 +34,7 @@ from . import procpoll
 from .datastore import TaskDataStoreSet
 from .debug import debug
 from .decorators import flow_decorators
+from .flowspec import _FlowState
 from .mflog import mflog, RUNTIME_LOG_SOURCE
 from .util import to_unicode, compress_list, unicode_type
 from .clone_util import clone_task_helper
@@ -39,6 +43,10 @@ from .unbounded_foreach import (
     UBF_CONTROL,
     UBF_TASK,
 )
+
+from .user_configs.config_options import ConfigInput
+from .user_configs.config_parameters import dump_config_values
+
 import metaflow.tracing as tracing

 MAX_WORKERS = 16
@@ -49,7 +57,13 @@ PROGRESS_INTERVAL = 300 # s
 # The following is a list of the (data) artifacts used by the runtime while
 # executing a flow. These are prefetched during the resume operation by
 # leveraging the TaskDataStoreSet.
-PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+PREFETCH_DATA_ARTIFACTS = [
+    "_foreach_stack",
+    "_task_ok",
+    "_transition",
+    "_control_mapper_tasks",
+    "_control_task_is_mapper_zero",
+]
 RESUME_POLL_SECONDS = 60

 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
@@ -111,7 +125,7 @@ class NativeRuntime(object):
         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
         self._cloned_tasks = []
-        self.
+        self._ran_or_scheduled_task_index = set()
         self._reentrant = reentrant
         self._run_url = None

@@ -269,6 +283,8 @@ class NativeRuntime(object):
         step_name,
         task_id,
         pathspec_index,
+        cloned_task_pathspec_index,
+        finished_tuple,
         ubf_context,
         generate_task_obj,
         verbose=False,
@@ -281,8 +297,13 @@ class NativeRuntime(object):
             task.ubf_context = ubf_context
             new_task_id = task.task_id
             self._cloned_tasks.append(task)
-            self.
-
+            self._ran_or_scheduled_task_index.add(cloned_task_pathspec_index)
+            task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+        else:
+            task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+            Task.clone_pathspec_mapping[task_pathspec] = "{}/{}/{}".format(
+                self._clone_run_id, step_name, task_id
+            )
         if verbose:
             self._logger(
                 "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
@@ -308,6 +329,8 @@ class NativeRuntime(object):
                 self._metadata,
                 origin_ds_set=self._origin_ds_set,
             )
+            self._finished[(step_name, finished_tuple)] = task_pathspec
+            self._is_cloned[task_pathspec] = True
         except Exception as e:
             self._logger(
                 "Cloning {}/{}/{}/{} failed with error: {}".format(
@@ -323,7 +346,8 @@ class NativeRuntime(object):

         inputs = []

-        ubf_mapper_tasks_to_clone =
+        ubf_mapper_tasks_to_clone = set()
+        ubf_control_tasks = set()
         # We only clone ubf mapper tasks if the control task is complete.
         # Here we need to check which control tasks are complete, and then get the corresponding
         # mapper tasks.
@@ -331,13 +355,25 @@ class NativeRuntime(object):
             _, step_name, task_id = task_ds.pathspec.split("/")
             pathspec_index = task_ds.pathspec_index
             if task_ds["_task_ok"] and step_name != "_parameters":
-                #
+                # Control task contains "_control_mapper_tasks" but, in the case of
+                # @parallel decorator, the control task is also a mapper task so we
+                # need to distinguish this using _control_task_is_mapper_zero
                 control_mapper_tasks = (
                     []
                     if "_control_mapper_tasks" not in task_ds
                     else task_ds["_control_mapper_tasks"]
                 )
-
+                if control_mapper_tasks:
+                    if task_ds.get("_control_task_is_mapper_zero", False):
+                        # Strip out the control task of list of mapper tasks
+                        ubf_control_tasks.add(control_mapper_tasks[0])
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks[1:])
+                    else:
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks)
+                        # Since we only add mapper tasks here, if we are not in the list
+                        # we are a control task
+                        if task_ds.pathspec not in ubf_mapper_tasks_to_clone:
+                            ubf_control_tasks.add(task_ds.pathspec)

         for task_ds in self._origin_ds_set:
             _, step_name, task_id = task_ds.pathspec.split("/")
@@ -348,35 +384,58 @@ class NativeRuntime(object):
                 and step_name != "_parameters"
                 and (step_name not in self._steps_to_rerun)
             ):
-                # "_unbounded_foreach" is a special flag to indicate that the transition
-                #
-                #
+                # "_unbounded_foreach" is a special flag to indicate that the transition
+                # is an unbounded foreach.
+                # Both parent and splitted children tasks will have this flag set.
+                # The splitted control/mapper tasks
+                # are not foreach types because UBF is always followed by a join step.
                 is_ubf_task = (
                     "_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
-                ) and (self._graph[step_name].
+                ) and (self._graph[step_name].type != "foreach")

-
-
-
-
-
-                )
-                is_ubf_mapper_tasks = is_ubf_task and (not is_ubf_control_task)
-                if is_ubf_mapper_tasks and (
+                is_ubf_control_task = task_ds.pathspec in ubf_control_tasks
+
+                is_ubf_mapper_task = is_ubf_task and (not is_ubf_control_task)
+
+                if is_ubf_mapper_task and (
                     task_ds.pathspec not in ubf_mapper_tasks_to_clone
                 ):
-                    # Skip copying UBF mapper tasks if control
+                    # Skip copying UBF mapper tasks if control task is incomplete.
                     continue

                 ubf_context = None
                 if is_ubf_task:
-                    ubf_context = "ubf_test" if
+                    ubf_context = "ubf_test" if is_ubf_mapper_task else "ubf_control"
+
+                finished_tuple = tuple(
+                    [s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
+                )
+                cloned_task_pathspec_index = pathspec_index.split("/")[1]
+                if task_ds.get("_control_task_is_mapper_zero", False):
+                    # Replace None with index 0 for control task as it is part of the
+                    # UBF (as a mapper as well)
+                    finished_tuple = finished_tuple[:-1] + (
+                        finished_tuple[-1]._replace(index=0),
+                    )
+                    # We need this reverse override though because when we check
+                    # if a task has been cloned in _queue_push, the index will be None
+                    # because the _control_task_is_mapper_zero is set in the control
+                    # task *itself* and *not* in the one that is launching the UBF nest.
+                    # This means that _translate_index will use None.
+                    cloned_task_pathspec_index = re.sub(
+                        r"(\[(?:\d+, ?)*)0\]",
+                        lambda m: (m.group(1) or "[") + "None]",
+                        cloned_task_pathspec_index,
+                    )
+
                 inputs.append(
                     (
                         step_name,
                         task_id,
                         pathspec_index,
-
+                        cloned_task_pathspec_index,
+                        finished_tuple,
+                        is_ubf_mapper_task,
                         ubf_context,
                     )
                 )
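Note: the `re.sub` call above rewrites the trailing foreach index of a cloned control task's pathspec index, turning a literal `0` into `None` so it matches the index that `_translate_index` produces when the same task is later checked in `_queue_push`. A standalone sketch of the substitution; the helper name and sample indices are ours, for illustration only:

    import re

    def _mapper_zero_to_none(pathspec_index):
        # Only a *trailing* index of 0 is rewritten; earlier indices are kept.
        return re.sub(
            r"(\[(?:\d+, ?)*)0\]",
            lambda m: (m.group(1) or "[") + "None]",
            pathspec_index,
        )

    print(_mapper_zero_to_none("a[0]"))     # a[None]
    print(_mapper_zero_to_none("a[3, 0]"))  # a[3, None]
    print(_mapper_zero_to_none("a[3, 1]"))  # a[3, 1]  (unchanged)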
@@ -388,15 +447,19 @@ class NativeRuntime(object):
                 step_name,
                 task_id,
                 pathspec_index,
+                cloned_task_pathspec_index,
+                finished_tuple,
                 ubf_context=ubf_context,
-                generate_task_obj=generate_task_obj and (not is_ubf_mapper_tasks),
+                generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
                 verbose=verbose,
             )
             for (
                 step_name,
                 task_id,
                 pathspec_index,
-
+                cloned_task_pathspec_index,
+                finished_tuple,
+                is_ubf_mapper_task,
                 ubf_context,
             ) in inputs
         ]
@@ -417,82 +480,95 @@ class NativeRuntime(object):
         else:
             self._queue_push("start", {})
         progress_tstamp = time.time()
-
-        #
-
-
-
-
-
-
-
-
-
-        #
-
-
-
-
-
-
-
-
-
-                        for k, v in self._active_tasks.items()
-                        if k != 0 and v[0] > 0
-                    ]
-                )
-                if self._active_tasks[0] == 0:
-                    msg = "No tasks are running."
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            # Configurations are passed through a file to avoid overloading the
+            # command-line. We only need to create this file once and it can be reused
+            # for any task launch
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            try:
+                # main scheduling loop
+                exception = None
+                while (
+                    self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks
+                ):
+                    # 1. are any of the current workers finished?
+                    if self._cloned_tasks:
+                        finished_tasks = self._cloned_tasks
+                        # reset the list of cloned tasks and let poll_workers handle
+                        # the remaining transition
+                        self._cloned_tasks = []
                     else:
-
-
+                        finished_tasks = list(self._poll_workers())
+                    # 2. push new tasks triggered by the finished tasks to the queue
+                    self._queue_tasks(finished_tasks)
+                    # 3. if there are available worker slots, pop and start tasks
+                    # from the queue.
+                    self._launch_workers()
+
+                    if time.time() - progress_tstamp > PROGRESS_INTERVAL:
+                        progress_tstamp = time.time()
+                        tasks_print = ", ".join(
+                            [
+                                "%s (%d running; %d done)" % (k, v[0], v[1])
+                                for k, v in self._active_tasks.items()
+                                if k != 0 and v[0] > 0
+                            ]
+                        )
+                        if self._active_tasks[0] == 0:
+                            msg = "No tasks are running."
                        else:
-
-
+                            if self._active_tasks[0] == 1:
+                                msg = "1 task is running: "
+                            else:
+                                msg = "%d tasks are running: " % self._active_tasks[0]
+                            msg += "%s." % tasks_print

-
+                        self._logger(msg, system_msg=True)

-
-
-                else:
-                    if len(self._run_queue) == 1:
-                        msg = "1 task is waiting in the queue: "
+                        if len(self._run_queue) == 0:
+                            msg = "No tasks are waiting in the queue."
                         else:
-
-
-
+                            if len(self._run_queue) == 1:
+                                msg = "1 task is waiting in the queue: "
+                            else:
+                                msg = "%d tasks are waiting in the queue." % len(
+                                    self._run_queue
+                                )

-                self._logger(msg, system_msg=True)
-                if len(self._unprocessed_steps) > 0:
-                    if len(self._unprocessed_steps) == 1:
-                        msg = "%s step has not started" % (
-                            next(iter(self._unprocessed_steps)),
-                        )
-                    else:
-                        msg = "%d steps have not started: " % len(
-                            self._unprocessed_steps
-                        )
-                    msg += "%s." % ", ".join(self._unprocessed_steps)
                         self._logger(msg, system_msg=True)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                        if len(self._unprocessed_steps) > 0:
+                            if len(self._unprocessed_steps) == 1:
+                                msg = "%s step has not started" % (
+                                    next(iter(self._unprocessed_steps)),
+                                )
+                            else:
+                                msg = "%d steps have not started: " % len(
+                                    self._unprocessed_steps
+                                )
+                            msg += "%s." % ", ".join(self._unprocessed_steps)
+                            self._logger(msg, system_msg=True)
+
+            except KeyboardInterrupt as ex:
+                self._logger("Workflow interrupted.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            except Exception as ex:
+                self._logger("Workflow failed.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            finally:
+                # on finish clean tasks
+                for step in self._flow:
+                    for deco in step.decorators:
+                        deco.runtime_finished(exception)

         # assert that end was executed and it was successful
         if ("end", ()) in self._finished:
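Note: the `with tempfile.NamedTemporaryFile(...)` block above keeps the config file alive for the entire scheduling loop, so every launched task can read the configuration by file name instead of receiving the values on the command line. A minimal sketch of the same pattern, with a hypothetical payload and a placeholder command standing in for `dump_config_values` and the real task invocation:

    import json
    import subprocess
    import tempfile

    config_values = {"train": {"learning_rate": 0.01}}  # hypothetical payload
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
        json.dump(config_values, config_file)
        config_file.flush()  # make the contents visible to child processes
        # Children receive only the *name* of the file, keeping argv short.
        subprocess.run(["cat", config_file.name], check=True)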
@@ -546,7 +622,6 @@ class NativeRuntime(object):
     # Given the current task information (task_index), the type of transition,
     # and the split index, return the new task index.
     def _translate_index(self, task, next_step, type, split_index=None):
-        import re

         match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
         if match:
@@ -574,10 +649,18 @@ class NativeRuntime(object):
     # Store the parameters needed for task creation, so that pushing on items
     # onto the run_queue is an inexpensive operation.
    def _queue_push(self, step, task_kwargs, index=None):
-        #
-        #
-
-
+        # In the case of cloning, we set all the cloned tasks as the
+        # finished tasks when pushing tasks using _queue_tasks. This means that we
+        # could potentially try to push the same task multiple times (for example
+        # if multiple parents of a join are cloned). We therefore keep track of what
+        # has executed (been cloned) or what has been scheduled and avoid scheduling
+        # it again.
+        if index:
+            if index in self._ran_or_scheduled_task_index:
+                # It has already run or been scheduled
+                return
+            # Note that we are scheduling this to run
+            self._ran_or_scheduled_task_index.add(index)
         self._run_queue.insert(0, (step, task_kwargs))
         # For foreaches, this will happen multiple time but is ok, becomes a no-op
         self._unprocessed_steps.discard(step)
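Note: the guard added to `_queue_push` makes scheduling idempotent. When several cloned parents of a join are all replayed as finished tasks, each one triggers a push of the same join task; only the first push is accepted. A condensed sketch of the pattern with simplified names:

    class Scheduler:
        def __init__(self):
            self._seen = set()  # indices already cloned or scheduled
            self._run_queue = []

        def queue_push(self, step, task_kwargs, index=None):
            if index:
                if index in self._seen:
                    return  # duplicate push from another cloned parent
                self._seen.add(index)
            self._run_queue.insert(0, (step, task_kwargs))

    s = Scheduler()
    s.queue_push("join", {}, index="join[2]")
    s.queue_push("join", {}, index="join[2]")  # no-op
    assert len(s._run_queue) == 1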
@@ -640,15 +723,18 @@ class NativeRuntime(object):
             # If the control task is cloned, all mapper tasks should have been cloned
             # as well, so we no longer need to handle cloning of mapper tasks in runtime.

-            # Update _finished
-            #
-
-
-
-
-
-
-
+            # Update _finished if we are not cloned. If we were cloned, we already
+            # updated _finished with the new tasks. Note that the *value* of mapper
+            # tasks is incorrect and contains the pathspec of the *cloned* run
+            # but we don't use it for anything. We could look to clean it up though
+            if not task.is_cloned:
+                _, foreach_stack = task.finished_id
+                top = foreach_stack[-1]
+                bottom = list(foreach_stack[:-1])
+                for i in range(num_splits):
+                    s = tuple(bottom + [top._replace(index=i)])
+                    self._finished[(task.step, s)] = mapper_tasks[i]
+                    self._is_cloned[mapper_tasks[i]] = False

             # Find and check status of control task and retrieve its pathspec
             # for retrieving unbounded foreach cardinality.
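Note: the `_finished` bookkeeping above keys each mapper task by a foreach stack whose top frame is re-stamped with the mapper's split index via `namedtuple._replace`. A small self-contained illustration; the frame fields here are an assumption made for the example, the runtime code only relies on `index` (and `value`) being replaceable:

    from collections import namedtuple

    Frame = namedtuple("Frame", ["step", "var", "index", "value"])  # assumed shape

    foreach_stack = (Frame("start", "items", None, 0),)
    top, bottom = foreach_stack[-1], list(foreach_stack[:-1])
    for i in range(3):
        key = tuple(bottom + [top._replace(index=i)])
        print(key)  # one distinct key per mapper: index 0, 1, 2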
@@ -901,7 +987,7 @@ class NativeRuntime(object):
             )
             return

-        worker = Worker(task, self._max_log_size)
+        worker = Worker(task, self._max_log_size, self._config_file_name)
         for fd in worker.fds():
             self._workers[fd] = worker
             self._poll.add(fd)
@@ -1080,7 +1166,7 @@ class Task(object):
         # To avoid the edge case where the resume leader is selected but has not
         # yet written the _resume_leader metadata, we will wait for a few seconds.
         # We will wait for resume leader for at most 3 times.
-        for
+        for _ in range(3):
             if ds.has_metadata("_resume_leader", add_attempt=False):
                 resume_leader = ds.load_metadata(
                     ["_resume_leader"], add_attempt=False
@@ -1181,7 +1267,6 @@ class Task(object):
         # Open the output datastore only if the task is not being cloned.
         if not self._is_cloned:
             self.new_attempt()
-
         for deco in decos:
             deco.runtime_task_created(
                 self._ds,
@@ -1448,6 +1533,15 @@ class CLIArgs(object):
         for deco in flow_decorators(self.task.flow):
             self.top_level_options.update(deco.get_top_level_options())

+        # We also pass configuration options using the kv.<name> syntax which will cause
+        # the configuration options to be loaded from the CONFIG file (or local-config-file
+        # in the case of the local runtime)
+        configs = self.task.flow._flow_state.get(_FlowState.CONFIGS)
+        if configs:
+            self.top_level_options["config-value"] = [
+                (k, ConfigInput.make_key_name(k)) for k in configs
+            ]
+
         self.commands = ["step"]
         self.command_args = [self.task.step]
         self.command_options = {
@@ -1481,12 +1575,15 @@ class CLIArgs(object):
                 for value in v:
                     yield "--%s" % k
                     if not isinstance(value, bool):
-                        yield to_unicode(value)
+                        value = value if isinstance(value, tuple) else (value,)
+                        for vv in value:
+                            yield to_unicode(vv)

         args = list(self.entrypoint)
         args.extend(_options(self.top_level_options))
         args.extend(self.commands)
         args.extend(self.command_args)
+
         args.extend(_options(self.command_options))
         return args

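Note: the change to `_options` above lets one option carry multiple consecutive values: a tuple such as `("train", "kv.train")` under `--config-value` now expands into two separate arguments instead of one stringified tuple. A minimal sketch of the generator, using `str` where the runtime uses `to_unicode`:

    def _options(mapping):
        # Expand each option; tuple values become consecutive arguments.
        for k, v in mapping.items():
            for value in v:
                yield "--%s" % k
                if not isinstance(value, bool):
                    value = value if isinstance(value, tuple) else (value,)
                    for vv in value:
                        yield str(vv)

    print(list(_options({"config-value": [("train", "kv.train")]})))
    # ['--config-value', 'train', 'kv.train']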
@@ -1498,8 +1595,9 @@


 class Worker(object):
-    def __init__(self, task, max_logs_size):
+    def __init__(self, task, max_logs_size, config_file_name):
         self.task = task
+        self._config_file_name = config_file_name
         self._proc = self._launch()

         if task.retries > task.user_code_retries:
@@ -1551,6 +1649,12 @@
             self.task.user_code_retries,
             self.task.ubf_context,
         )
+
+        # Add user configurations using a file to avoid using up too much space on the
+        # command line
+        if self._config_file_name:
+            args.top_level_options["local-config-file"] = self._config_file_name
+        # Pass configuration options
         env.update(args.get_env())
         env["PYTHONUNBUFFERED"] = "x"
         tracing.inject_tracing_vars(env)
metaflow/sidecar/sidecar_worker.py CHANGED

@@ -48,8 +48,8 @@ def process_messages(worker_type, worker):
         pass


-@tracing.cli_entrypoint("sidecar")
 @click.command(help="Initialize workers")
+@tracing.cli_entrypoint("sidecar")
 @click.argument("worker-type")
 def main(worker_type):
     sidecar_type = SIDECARS.get(worker_type)
metaflow/tracing/tracing_modules.py CHANGED

@@ -18,8 +18,11 @@ tracer_provider = None

 def init_tracing():
     global tracer_provider
+    # Disable logging from opentelemetry
+    import logging
+    logging.getLogger("opentelemetry").setLevel(logging.FATAL)
     if tracer_provider is not None:
-        print("Tracing already initialized", file=sys.stderr)
+        # print("Tracing already initialized", file=sys.stderr)
         return

     from .propagator import EnvPropagator
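Note: `init_tracing` now raises the `opentelemetry` logger's threshold before doing any other work, which suppresses the library's warnings without touching application logging. The same one-liner works for any chatty third-party logger; a standalone sketch:

    import logging

    # Only FATAL records from the library pass; other loggers are unaffected.
    logging.getLogger("opentelemetry").setLevel(logging.FATAL)
    logging.getLogger("opentelemetry").warning("suppressed")
    logging.getLogger("myapp").warning("still emitted")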