mindspore 2.7.0__cp310-cp310-win_amd64.whl → 2.7.0rc1__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mindspore might be problematic. Click here for more details.
- mindspore/.commit_id +1 -1
- mindspore/__init__.py +1 -1
- mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
- mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
- mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
- mindspore/_checkparam.py +2 -2
- mindspore/_extends/builtin_operations.py +3 -3
- mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
- mindspore/_extends/parse/__init__.py +3 -3
- mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -0
- mindspore/_extends/parse/parser.py +22 -28
- mindspore/_extends/parse/standard_method.py +1 -15
- mindspore/_extends/pijit/pijit_func_white_list.py +5 -2
- mindspore/_extends/remote/kernel_build_server_ascend.py +75 -0
- mindspore/amp.py +18 -0
- mindspore/avcodec-59.dll +0 -0
- mindspore/avdevice-59.dll +0 -0
- mindspore/avfilter-8.dll +0 -0
- mindspore/avformat-59.dll +0 -0
- mindspore/avutil-57.dll +0 -0
- mindspore/common/__init__.py +12 -18
- mindspore/common/_tensor_cpp_method.py +1 -1
- mindspore/common/_tensor_docs.py +38 -102
- mindspore/common/_utils.py +1 -9
- mindspore/common/api.py +106 -155
- mindspore/common/{dynamic_shape/auto_dynamic_shape.py → auto_dynamic_shape.py} +23 -17
- mindspore/common/dtype.py +57 -98
- mindspore/common/dump.py +1 -1
- mindspore/common/file_system.py +9 -59
- mindspore/common/hook_handle.py +3 -22
- mindspore/common/np_dtype.py +3 -3
- mindspore/common/parameter.py +20 -4
- mindspore/common/recompute.py +4 -2
- mindspore/common/tensor.py +52 -38
- mindspore/communication/_hccl_management.py +297 -0
- mindspore/context.py +21 -15
- mindspore/dataset/__init__.py +1 -1
- mindspore/dataset/audio/transforms.py +1 -1
- mindspore/dataset/core/config.py +1 -35
- mindspore/dataset/engine/datasets.py +315 -330
- mindspore/dataset/engine/datasets_user_defined.py +22 -38
- mindspore/dataset/transforms/c_transforms.py +2 -2
- mindspore/dataset/transforms/transforms.py +3 -3
- mindspore/dataset/vision/__init__.py +1 -1
- mindspore/dataset/vision/py_transforms.py +8 -8
- mindspore/dataset/vision/transforms.py +5 -17
- mindspore/dataset/vision/utils.py +21 -632
- mindspore/device_context/ascend/op_tuning.py +1 -35
- mindspore/dnnl.dll +0 -0
- mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -3
- mindspore/include/api/cell.h +4 -28
- mindspore/include/api/cfg.h +7 -24
- mindspore/include/api/context.h +0 -1
- mindspore/include/api/delegate.h +2 -0
- mindspore/include/api/dual_abi_helper.h +19 -100
- mindspore/include/api/graph.h +1 -14
- mindspore/include/api/kernel.h +3 -16
- mindspore/include/api/kernel_api.h +1 -9
- mindspore/include/api/metrics/accuracy.h +0 -9
- mindspore/include/api/model.h +1 -5
- mindspore/include/api/model_group.h +0 -4
- mindspore/include/api/model_parallel_runner.h +0 -2
- mindspore/include/api/status.h +10 -48
- mindspore/include/api/types.h +1 -6
- mindspore/include/dataset/constants.h +0 -9
- mindspore/jpeg62.dll +0 -0
- mindspore/mindrecord/tools/cifar10.py +2 -3
- mindspore/mindrecord/tools/cifar10_to_mr.py +5 -5
- mindspore/mindspore_backend_common.dll +0 -0
- mindspore/mindspore_backend_manager.dll +0 -0
- mindspore/mindspore_common.dll +0 -0
- mindspore/mindspore_core.dll +0 -0
- mindspore/mindspore_cpu_res_manager.dll +0 -0
- mindspore/mindspore_dump.dll +0 -0
- mindspore/mindspore_frontend.dll +0 -0
- mindspore/mindspore_glog.dll +0 -0
- mindspore/mindspore_memory_pool.dll +0 -0
- mindspore/mindspore_ms_backend.dll +0 -0
- mindspore/mindspore_ops.dll +0 -0
- mindspore/mindspore_ops_host.dll +0 -0
- mindspore/mindspore_ops_kernel_common.dll +0 -0
- mindspore/mindspore_profiler.dll +0 -0
- mindspore/mindspore_pyboost.dll +0 -0
- mindspore/mindspore_pynative.dll +0 -0
- mindspore/mindspore_res_manager.dll +0 -0
- mindspore/mindspore_runtime_pipeline.dll +0 -0
- mindspore/mint/distributed/__init__.py +0 -4
- mindspore/mint/distributed/distributed.py +14 -217
- mindspore/mint/nn/layer/_functions.py +2 -1
- mindspore/mint/nn/layer/conv.py +6 -6
- mindspore/mint/nn/layer/normalization.py +3 -3
- mindspore/nn/cell.py +174 -216
- mindspore/nn/layer/activation.py +2 -4
- mindspore/nn/layer/basic.py +13 -7
- mindspore/nn/layer/image.py +1 -1
- mindspore/nn/optim/adam.py +3 -1
- mindspore/nn/optim/lamb.py +3 -1
- mindspore/nn/optim/tft_wrapper.py +3 -2
- mindspore/nn/probability/distribution/_utils/utils.py +2 -2
- mindspore/nn/wrap/cell_wrapper.py +5 -39
- mindspore/nn/wrap/grad_reducer.py +15 -0
- mindspore/numpy/array_creations.py +2 -2
- mindspore/numpy/utils_const.py +1 -1
- mindspore/opencv_core452.dll +0 -0
- mindspore/opencv_imgcodecs452.dll +0 -0
- mindspore/opencv_imgproc452.dll +0 -0
- mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
- mindspore/ops/_op_impl/cpu/__init__.py +0 -1
- mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +2 -12
- mindspore/ops/auto_generate/gen_extend_func.py +4 -4
- mindspore/ops/auto_generate/gen_ops_def.py +16 -290
- mindspore/ops/auto_generate/gen_ops_prim.py +76 -563
- mindspore/ops/composite/base.py +1 -1
- mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
- mindspore/ops/function/__init__.py +0 -1
- mindspore/ops/function/array_func.py +6 -10
- mindspore/ops/function/debug_func.py +2 -4
- mindspore/ops/function/grad/grad_func.py +12 -4
- mindspore/ops/function/math_func.py +32 -44
- mindspore/ops/function/nn_func.py +20 -18
- mindspore/ops/functional.py +1 -2
- mindspore/ops/functional_overload.py +12 -23
- mindspore/ops/operations/_inner_ops.py +12 -11
- mindspore/ops/operations/array_ops.py +50 -4
- mindspore/ops/operations/comm_ops.py +15 -1
- mindspore/ops/operations/custom_ops.py +4 -10
- mindspore/ops/operations/debug_ops.py +6 -6
- mindspore/ops/operations/manually_defined/ops_def.py +12 -12
- mindspore/ops/operations/math_ops.py +5 -5
- mindspore/ops/operations/nn_ops.py +1 -1
- mindspore/ops/primitive.py +10 -3
- mindspore/ops/tensor_method.py +7 -16
- mindspore/ops_generate/pyboost/gen_pyboost_func.py +16 -0
- mindspore/parallel/_auto_parallel_context.py +15 -5
- mindspore/parallel/_parallel_serialization.py +2 -3
- mindspore/parallel/_ps_context.py +2 -2
- mindspore/parallel/_transformer/transformer.py +4 -4
- mindspore/parallel/_utils.py +11 -5
- mindspore/parallel/auto_parallel.py +9 -23
- mindspore/parallel/checkpoint_transform.py +0 -2
- mindspore/parallel/cluster/process_entity/_api.py +1 -4
- mindspore/parallel/cluster/run.py +3 -5
- mindspore/parallel/function/reshard_func.py +5 -6
- mindspore/parallel/nn/parallel_cell_wrapper.py +3 -40
- mindspore/parallel/nn/parallel_grad_reducer.py +8 -0
- mindspore/parallel/shard.py +21 -7
- mindspore/parallel/transform_safetensors.py +4 -10
- mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +9 -10
- mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
- mindspore/profiler/common/msprof_cmd_tool.py +2 -2
- mindspore/profiler/common/path_manager.py +0 -9
- mindspore/profiler/common/profiler_context.py +2 -25
- mindspore/profiler/common/profiler_meta_data.py +0 -1
- mindspore/profiler/common/profiler_op_analyse.py +6 -10
- mindspore/{ops/_op_impl/cpu/joinedstr_op.py → profiler/common/validator/__init__.py} +1 -15
- mindspore/profiler/common/validator/validate_path.py +84 -0
- mindspore/profiler/dynamic_profiler.py +46 -91
- mindspore/profiler/envprofiler.py +5 -30
- mindspore/profiler/experimental_config.py +1 -16
- mindspore/profiler/platform/cpu_profiler.py +4 -10
- mindspore/profiler/platform/npu_profiler.py +1 -1
- mindspore/profiler/profiler.py +145 -193
- mindspore/profiler/profiler_action_controller.py +1 -1
- mindspore/profiler/profiler_interface.py +2 -2
- mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
- mindspore/runtime/__init__.py +4 -6
- mindspore/runtime/executor.py +0 -27
- mindspore/runtime/memory.py +0 -1
- mindspore/runtime/thread_bind_core.py +1 -1
- mindspore/swresample-4.dll +0 -0
- mindspore/swscale-6.dll +0 -0
- mindspore/tinyxml2.dll +0 -0
- mindspore/train/_utils.py +3 -3
- mindspore/train/amp.py +3 -0
- mindspore/train/callback/_callback.py +1 -2
- mindspore/train/callback/_checkpoint.py +8 -1
- mindspore/train/callback/_flops_collector.py +6 -10
- mindspore/train/callback/_train_fault_tolerance.py +7 -3
- mindspore/train/data_sink.py +4 -4
- mindspore/train/dataset_helper.py +5 -5
- mindspore/train/model.py +20 -4
- mindspore/train/serialization.py +15 -35
- mindspore/train/train_thor/model_thor.py +2 -2
- mindspore/turbojpeg.dll +0 -0
- mindspore/utils/hooks.py +81 -0
- mindspore/utils/utils.py +8 -8
- mindspore/version.py +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +1 -1
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +193 -192
- mindspore/_extends/parallel_compile/akg_compiler/custom.py +0 -1109
- mindspore/common/dynamic_shape/__init__.py +0 -0
- mindspore/common/dynamic_shape/enable_dynamic.py +0 -197
- /mindspore/common/{dynamic_shape/_auto_dynamic.py → _auto_dynamic.py} +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
- {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
|
@@ -29,8 +29,10 @@ import atexit
|
|
|
29
29
|
import glob
|
|
30
30
|
import json
|
|
31
31
|
import os
|
|
32
|
+
import queue
|
|
32
33
|
import signal
|
|
33
34
|
import stat
|
|
35
|
+
import subprocess
|
|
34
36
|
import warnings
|
|
35
37
|
|
|
36
38
|
import time
|
|
@@ -39,7 +41,6 @@ import multiprocessing
|
|
|
39
41
|
from importlib import import_module
|
|
40
42
|
import sys
|
|
41
43
|
import threading
|
|
42
|
-
from types import GeneratorType
|
|
43
44
|
|
|
44
45
|
import copy
|
|
45
46
|
import weakref
|
|
@@ -64,6 +65,7 @@ from mindspore.dataset.engine import samplers
|
|
|
64
65
|
from mindspore.dataset.engine.samplers import Shuffle
|
|
65
66
|
from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
|
|
66
67
|
ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
|
|
68
|
+
from .queue import _SharedQueue, _Queue
|
|
67
69
|
from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
|
|
68
70
|
check_rename, check_device_send, check_take, check_output_shape, check_project, \
|
|
69
71
|
check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
|
|
@@ -71,8 +73,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
|
|
|
71
73
|
check_total_batch, check_sync_update
|
|
72
74
|
from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
|
|
73
75
|
get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, \
|
|
74
|
-
_get_debug_hook_list, get_multiprocessing_start_method
|
|
75
|
-
get_error_samples_mode, ErrorSamplesMode
|
|
76
|
+
_get_debug_hook_list, get_multiprocessing_start_method
|
|
76
77
|
from ..core.datatypes import mstype_to_detype
|
|
77
78
|
from ..core.validator_helpers import replace_none
|
|
78
79
|
from ..core.py_util_helpers import ExceptionHandler
|
|
@@ -2751,6 +2752,8 @@ class BatchDataset(UnionBaseDataset):
|
|
|
2751
2752
|
|
|
2752
2753
|
self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(), self.num_parallel_workers,
|
|
2753
2754
|
str(self), [self.per_batch_map], self.max_rowsize)
|
|
2755
|
+
# Wrap per_batch_map into _PythonCallable
|
|
2756
|
+
self.per_batch_map = _PythonCallable(self.per_batch_map, 0, self.process_pool)
|
|
2754
2757
|
else:
|
|
2755
2758
|
if self.per_batch_map is not None:
|
|
2756
2759
|
self.per_batch_map = FuncWrapper(self.per_batch_map)
|
|
@@ -3054,6 +3057,95 @@ _OP_NAME = dict()
|
|
|
3054
3057
|
_OP_PROCESS = dict()
|
|
3055
3058
|
|
|
3056
3059
|
|
|
3060
|
+
# PythonCallable wrapper for multiprocess pyfunc
|
|
3061
|
+
class _PythonCallable:
|
|
3062
|
+
"""
|
|
3063
|
+
Internal Python function wrapper for multiprocessing pyfunc.
|
|
3064
|
+
"""
|
|
3065
|
+
|
|
3066
|
+
def __init__(self, py_callable, idx, pool=None):
|
|
3067
|
+
# Original Python callable from user.
|
|
3068
|
+
self.py_callable = py_callable
|
|
3069
|
+
# Process pool created for current iterator.
|
|
3070
|
+
self.pool = pool
|
|
3071
|
+
# Python callable index
|
|
3072
|
+
self.idx = idx
|
|
3073
|
+
|
|
3074
|
+
def __call__(self, *args):
|
|
3075
|
+
result = None
|
|
3076
|
+
get_data_from_worker_process = False
|
|
3077
|
+
while get_data_from_worker_process is False:
|
|
3078
|
+
if self.pool.is_running() and check_iterator_cleanup() is False:
|
|
3079
|
+
try:
|
|
3080
|
+
result = self.pool.execute(self.idx, *args)
|
|
3081
|
+
except multiprocessing.TimeoutError:
|
|
3082
|
+
continue
|
|
3083
|
+
get_data_from_worker_process = True
|
|
3084
|
+
else:
|
|
3085
|
+
# worker process is stopped
|
|
3086
|
+
logger.info("The worker process of map operation is stopped. "
|
|
3087
|
+
"So return None to main thread and break the main thread.")
|
|
3088
|
+
return None
|
|
3089
|
+
# got value from worker process
|
|
3090
|
+
if not isinstance(result, tuple) and get_data_from_worker_process is True:
|
|
3091
|
+
result = (result,)
|
|
3092
|
+
return result
|
|
3093
|
+
|
|
3094
|
+
def to_json(self):
|
|
3095
|
+
return self.py_callable.to_json()
|
|
3096
|
+
|
|
3097
|
+
|
|
3098
|
+
# used when python_multiprocessing=True in map
|
|
3099
|
+
class Pipe:
|
|
3100
|
+
"""
|
|
3101
|
+
Class to handle communication between the master process and the worker processes.
|
|
3102
|
+
"""
|
|
3103
|
+
|
|
3104
|
+
def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
|
|
3105
|
+
self.shared_memory = shared_memory
|
|
3106
|
+
self.eof = multiprocessing.Event()
|
|
3107
|
+
if self.shared_memory:
|
|
3108
|
+
self.in_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[0])
|
|
3109
|
+
self.res_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[1])
|
|
3110
|
+
else:
|
|
3111
|
+
self.in_queue = _Queue(1)
|
|
3112
|
+
self.res_queue = _Queue(1)
|
|
3113
|
+
self.in_queue.cancel_join_thread() # Ensure that the process does not hung when exiting
|
|
3114
|
+
|
|
3115
|
+
def master_send(self, func_index, data):
|
|
3116
|
+
self.in_queue.put_nowait((func_index, *data))
|
|
3117
|
+
|
|
3118
|
+
def master_receive(self):
|
|
3119
|
+
if self.eof is None:
|
|
3120
|
+
raise RuntimeError("EOF is none when get data from worker.")
|
|
3121
|
+
if self.eof.is_set():
|
|
3122
|
+
return None
|
|
3123
|
+
return self.res_queue.get(timeout=1)
|
|
3124
|
+
|
|
3125
|
+
def master_close(self):
|
|
3126
|
+
self.eof.set()
|
|
3127
|
+
self.send_finish_signal_to_worker()
|
|
3128
|
+
self.send_finish_signal()
|
|
3129
|
+
|
|
3130
|
+
def send_finish_signal(self):
|
|
3131
|
+
self.worker_send(None)
|
|
3132
|
+
|
|
3133
|
+
def send_finish_signal_to_worker(self):
|
|
3134
|
+
self.master_send(0, "QUIT")
|
|
3135
|
+
|
|
3136
|
+
def worker_send(self, data):
|
|
3137
|
+
self.res_queue.put_until(data, timeout=1, exit_signal=self.eof)
|
|
3138
|
+
|
|
3139
|
+
def worker_receive(self):
|
|
3140
|
+
result = self.in_queue.get_until(timeout=1, exit_signal=self.eof)
|
|
3141
|
+
if result is None:
|
|
3142
|
+
return result
|
|
3143
|
+
if len(result) == 1:
|
|
3144
|
+
raise RuntimeError(f"Corrupted data. Worker received {len(result)} elements, it should be more than 1.")
|
|
3145
|
+
func_index, *data = result
|
|
3146
|
+
return func_index, tuple(data)
|
|
3147
|
+
|
|
3148
|
+
|
|
3057
3149
|
def _main_process_already_exit():
|
|
3058
3150
|
"""
|
|
3059
3151
|
Judge whether main process already exit.
|
|
@@ -3066,18 +3158,15 @@ def _main_process_already_exit():
|
|
|
3066
3158
|
return False
|
|
3067
3159
|
|
|
3068
3160
|
|
|
3069
|
-
def _worker_loop(
|
|
3161
|
+
def _worker_loop(operations, pipe, worker_id):
|
|
3070
3162
|
"""
|
|
3071
3163
|
Multiprocess worker process loop.
|
|
3072
|
-
The worker process(Python Layer) gets data from / sends data to map / batch thread(C++ layer) by message queue
|
|
3073
|
-
and shared memory. This logic no longer uses the Python multi-process pool, in_queue, and out_queue for
|
|
3074
|
-
data transferring.
|
|
3075
3164
|
"""
|
|
3076
3165
|
# Initialize C++ side signal handlers
|
|
3077
3166
|
cde.register_worker_handlers()
|
|
3078
3167
|
|
|
3079
|
-
|
|
3080
|
-
|
|
3168
|
+
# Ensure that the process does not hang when exiting
|
|
3169
|
+
pipe.res_queue.cancel_join_thread()
|
|
3081
3170
|
|
|
3082
3171
|
def _ignore_sigint():
|
|
3083
3172
|
"""
|
|
@@ -3091,197 +3180,121 @@ def _worker_loop(quit_signal, operations, worker_id, op_type, key, video_backend
|
|
|
3091
3180
|
if get_seed() != 5489:
|
|
3092
3181
|
set_seed(get_seed() + worker_id)
|
|
3093
3182
|
|
|
3094
|
-
msg_queue = cde.MessageQueue(key)
|
|
3095
|
-
msg_queue.set_release_flag(False)
|
|
3096
|
-
shm_queue = cde.SharedMemoryQueue(key)
|
|
3097
|
-
shm_queue.set_release_flag(False)
|
|
3098
|
-
|
|
3099
|
-
pid = str(os.getpid())
|
|
3100
|
-
ppid = str(os.getppid())
|
|
3101
|
-
|
|
3102
|
-
# Scenario: when the main process is killed, worker processe needs to release shm & msg.
|
|
3103
|
-
# The shm id and msg id should be released by SIGTERM in worker handler
|
|
3104
|
-
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3105
|
-
msg_queue.msg_queue_id)
|
|
3106
|
-
|
|
3107
|
-
num_receive = 0
|
|
3108
|
-
num_send = 0
|
|
3109
3183
|
while not _main_process_already_exit():
|
|
3110
3184
|
_ignore_sigint()
|
|
3111
3185
|
|
|
3112
|
-
|
|
3113
|
-
if
|
|
3186
|
+
result = pipe.worker_receive()
|
|
3187
|
+
if result is None:
|
|
3114
3188
|
return
|
|
3115
|
-
|
|
3116
|
-
|
|
3117
|
-
|
|
3189
|
+
(idx, input_tensors) = result
|
|
3190
|
+
if input_tensors == "QUIT":
|
|
3191
|
+
break
|
|
3118
3192
|
try:
|
|
3119
|
-
|
|
3120
|
-
msg_queue.msg_queue_id)
|
|
3121
|
-
msg_queue.msg_rcv(cde.MASTER_SEND_DATA_MSG)
|
|
3122
|
-
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3123
|
-
msg_queue.msg_queue_id)
|
|
3124
|
-
except RuntimeError as err:
|
|
3125
|
-
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3126
|
-
msg_queue.msg_queue_id)
|
|
3127
|
-
# the msg_queue had been released by main process, ignore it in worker process
|
|
3128
|
-
if "errno: 2" in str(err):
|
|
3129
|
-
# Because the worker process does not release msg and shm, continue
|
|
3130
|
-
continue
|
|
3131
|
-
raise err
|
|
3193
|
+
output_tensors = operations[idx](*input_tensors)
|
|
3132
3194
|
|
|
3133
|
-
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3195
|
+
pipe.worker_send(output_tensors)
|
|
3196
|
+
except Exception:
|
|
3197
|
+
pipe.worker_send(ExceptionHandler(where="in map(or batch) worker and execute Python function"))
|
|
3198
|
+
# Do not return
|
|
3137
3199
|
|
|
3138
|
-
|
|
3200
|
+
# release the queue when stop the worker by master
|
|
3201
|
+
del pipe.in_queue
|
|
3202
|
+
del pipe.res_queue
|
|
3139
3203
|
|
|
3140
|
-
logger.info("Python process {} worker({}) receives {} samples from map thread.".format(op_type, worker_id,
|
|
3141
|
-
num_receive))
|
|
3142
3204
|
|
|
3143
|
-
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
tensor_row = shm_queue.to_tensor_row(msg_queue.shm_id, msg_queue.shm_size)
|
|
3205
|
+
def worker_target(operations, worker_id):
|
|
3206
|
+
logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
|
|
3207
|
+
return lambda pipe: _worker_loop(operations, pipe, worker_id)
|
|
3147
3208
|
|
|
3148
|
-
## 3. convert TensorRow to Python tuple which elements are a column
|
|
3149
|
-
tuple_column = cde.convert_tensor_row_to_py_tuple(tensor_row)
|
|
3150
3209
|
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3210
|
+
class WorkerTarget:
|
|
3211
|
+
def __init__(self, operations, pipe, worker_id):
|
|
3212
|
+
self.operations = operations
|
|
3213
|
+
self.pipe = pipe
|
|
3214
|
+
self.worker_id = worker_id
|
|
3215
|
+
logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
|
|
3155
3216
|
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
# The list indicate the multi rows
|
|
3159
|
-
tuple_list_column = cde.convert_tensor_table_to_py_tuple_list(tensor_table)
|
|
3217
|
+
def __call__(self):
|
|
3218
|
+
return _worker_loop(self.operations, self.pipe, self.worker_id)
|
|
3160
3219
|
|
|
3161
|
-
py_func_input = (*tuple_list_column, batch_info)
|
|
3162
|
-
else:
|
|
3163
|
-
raise RuntimeError("The op_type: {} is invalid.".format(op_type))
|
|
3164
3220
|
|
|
3165
|
-
|
|
3166
|
-
|
|
3167
|
-
|
|
3221
|
+
class _MPWorker(multiprocessing.Process):
|
|
3222
|
+
"""
|
|
3223
|
+
Worker process for multiprocessing.
|
|
3224
|
+
"""
|
|
3168
3225
|
|
|
3169
|
-
|
|
3170
|
-
|
|
3171
|
-
|
|
3172
|
-
|
|
3226
|
+
def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
|
|
3227
|
+
shared_memory = get_enable_shared_mem()
|
|
3228
|
+
self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
|
|
3229
|
+
self.check_interval = get_multiprocessing_timeout_interval()
|
|
3230
|
+
super().__init__(target=worker_target(operations, worker_id), name="MapWorker" + str(worker_id),
|
|
3231
|
+
args=(self.pipe,), daemon=True)
|
|
3232
|
+
|
|
3233
|
+
def execute(self, idx, *args):
|
|
3234
|
+
"""Acquiring data from a worker in an infinite loop"""
|
|
3235
|
+
self.pipe.master_send(idx, args)
|
|
3236
|
+
time_s = time.time()
|
|
3237
|
+
wait_count = 1
|
|
3238
|
+
while True:
|
|
3239
|
+
cost_time = time.time() - time_s
|
|
3240
|
+
if cost_time / self.check_interval >= wait_count:
|
|
3241
|
+
wait_count += 1
|
|
3242
|
+
logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
|
|
3243
|
+
"worker of the map operation is hanging. "
|
|
3244
|
+
"Check whether the user defined data transform is too slow or the "
|
|
3245
|
+
"output data is too large. You can also set the timeout interval by "
|
|
3246
|
+
"ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
|
|
3247
|
+
"of this log.")
|
|
3248
|
+
pid = self.pid
|
|
3249
|
+
logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
|
|
3250
|
+
install_status, _ = subprocess.getstatusoutput("py-spy --version")
|
|
3251
|
+
if install_status == 0:
|
|
3252
|
+
stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
|
|
3253
|
+
logger.warning("Map worker subprocess stack:\n{}".format(stack))
|
|
3173
3254
|
else:
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
# << send procedure <<
|
|
3177
|
-
# the result is None
|
|
3178
|
-
if py_func_output is None:
|
|
3179
|
-
raise RuntimeError("Got None from Python Function which is defined by {}".format(op_type))
|
|
3180
|
-
|
|
3181
|
-
# convert the output to tuple
|
|
3182
|
-
if not isinstance(py_func_output, tuple):
|
|
3183
|
-
py_func_output = (py_func_output,)
|
|
3184
|
-
|
|
3185
|
-
if op_type == cde.MAP_OP:
|
|
3186
|
-
# check if the map return Generator type
|
|
3187
|
-
for item in py_func_output:
|
|
3188
|
-
if isinstance(item, GeneratorType):
|
|
3189
|
-
raise RuntimeError("Cannot pickle <class 'generator'> object, please verify pyfunc "
|
|
3190
|
-
"return with numpy array")
|
|
3191
|
-
|
|
3192
|
-
## 1. convert Python tuple to TensorRow
|
|
3193
|
-
output_tensor_row = cde.convert_py_tuple_to_tensor_row(py_func_output)
|
|
3194
|
-
|
|
3195
|
-
## 2. convert TensorRow to shared memory
|
|
3196
|
-
shm_queue.from_tensor_row(output_tensor_row)
|
|
3197
|
-
elif op_type == cde.BATCH_OP:
|
|
3198
|
-
## 1. convert Python tuple tuple to TensorTable
|
|
3199
|
-
output_tensor_table, concat_batch = cde.convert_py_tuple_list_to_tensor_table(py_func_output)
|
|
3200
|
-
|
|
3201
|
-
## 2. convert TensorTable to shared memory
|
|
3202
|
-
shm_queue.from_tensor_table(output_tensor_table, batch_info, concat_batch)
|
|
3203
|
-
else:
|
|
3204
|
-
raise RuntimeError("The op_type: {} is invalid.".format(op_type))
|
|
3205
|
-
|
|
3206
|
-
## 3. send message queue which contains shared memory to map C++ thread in main process
|
|
3207
|
-
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3208
|
-
msg_queue.msg_queue_id)
|
|
3209
|
-
msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
|
|
3210
|
-
cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
|
|
3211
|
-
msg_queue.msg_queue_id)
|
|
3212
|
-
|
|
3213
|
-
num_send += 1
|
|
3214
|
-
logger.info("Python process {} worker({}) sends {} samples to map thread.".format(op_type, worker_id,
|
|
3215
|
-
num_send))
|
|
3216
|
-
except Exception:
|
|
3255
|
+
logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
|
|
3217
3256
|
try:
|
|
3218
|
-
|
|
3219
|
-
|
|
3220
|
-
|
|
3221
|
-
|
|
3222
|
-
|
|
3223
|
-
|
|
3224
|
-
|
|
3225
|
-
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
3229
|
-
|
|
3230
|
-
|
|
3231
|
-
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
# release the eager executor which is used by current process
|
|
3253
|
-
transforms.transforms.clean_unused_executors()
|
|
3254
|
-
|
|
3255
|
-
while not _main_process_already_exit():
|
|
3256
|
-
# quit by close_worker
|
|
3257
|
-
if quit_signal.is_set():
|
|
3257
|
+
res = self.pipe.master_receive()
|
|
3258
|
+
except queue.Empty:
|
|
3259
|
+
continue
|
|
3260
|
+
if res is None:
|
|
3261
|
+
# receive finish signal
|
|
3262
|
+
return None
|
|
3263
|
+
if isinstance(res, ExceptionHandler):
|
|
3264
|
+
res.reraise()
|
|
3265
|
+
return res
|
|
3266
|
+
|
|
3267
|
+
def close(self):
|
|
3268
|
+
try:
|
|
3269
|
+
if self.is_alive():
|
|
3270
|
+
# release the eager executor which is used by current process
|
|
3271
|
+
transforms.transforms.clean_unused_executors()
|
|
3272
|
+
|
|
3273
|
+
logger.info(f"Closing worker with PID: {self.pid}")
|
|
3274
|
+
self.pipe.master_close()
|
|
3275
|
+
|
|
3276
|
+
process_dir = os.path.join('/proc', str(self.pid))
|
|
3277
|
+
while self.is_alive() and os.path.exists(process_dir):
|
|
3278
|
+
logger.info("Waiting for worker {} closed ...".format(self.pid))
|
|
3279
|
+
time.sleep(0.001)
|
|
3280
|
+
|
|
3281
|
+
# del the handle which hold by master
|
|
3282
|
+
del self.pipe.in_queue
|
|
3283
|
+
del self.pipe.res_queue
|
|
3284
|
+
super().terminate()
|
|
3285
|
+
super().join()
|
|
3286
|
+
super().close()
|
|
3287
|
+
|
|
3288
|
+
except ValueError:
|
|
3289
|
+
# Process has been closed already
|
|
3258
3290
|
return
|
|
3291
|
+
return
|
|
3259
3292
|
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
msg_queue.release()
|
|
3266
|
-
shm_queue.set_release_flag(True)
|
|
3267
|
-
shm_queue.release()
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
class WorkerTarget:
|
|
3271
|
-
"""Mulitprocess mode for dataset map or batch"""
|
|
3272
|
-
def __init__(self, quit_signal, operations, worker_id, op_type, ftok_key):
|
|
3273
|
-
self.quit_signal = quit_signal
|
|
3274
|
-
self.operations = operations
|
|
3275
|
-
self.worker_id = worker_id
|
|
3276
|
-
self.op_type = op_type
|
|
3277
|
-
self.ftok_key = ftok_key
|
|
3278
|
-
start_method = multiprocessing.get_start_method()
|
|
3279
|
-
logger.info("Multiprocessing start method: {}".format(start_method))
|
|
3280
|
-
self.video_backend = get_video_backend() if start_method == 'spawn' else None
|
|
3281
|
-
|
|
3282
|
-
def __call__(self):
|
|
3283
|
-
return _worker_loop(self.quit_signal, self.operations, self.worker_id, self.op_type, self.ftok_key,
|
|
3284
|
-
self.video_backend)
|
|
3293
|
+
def is_alive(self):
|
|
3294
|
+
try:
|
|
3295
|
+
return super().is_alive()
|
|
3296
|
+
except ValueError:
|
|
3297
|
+
return False
|
|
3285
3298
|
|
|
3286
3299
|
|
|
3287
3300
|
def worker_is_alive(worker):
|
|
@@ -3292,31 +3305,24 @@ def worker_is_alive(worker):
|
|
|
3292
3305
|
return False
|
|
3293
3306
|
|
|
3294
3307
|
|
|
3295
|
-
def close_worker(worker,
|
|
3308
|
+
def close_worker(worker, pipe):
|
|
3296
3309
|
"""Close the subprocess worker in spawn mode"""
|
|
3297
3310
|
try:
|
|
3298
3311
|
if worker_is_alive(worker):
|
|
3299
3312
|
# release the eager executor which is used by current process
|
|
3300
3313
|
transforms.transforms.clean_unused_executors()
|
|
3301
3314
|
|
|
3302
|
-
|
|
3303
|
-
|
|
3304
|
-
eof.set()
|
|
3305
|
-
|
|
3306
|
-
# wait timeout
|
|
3307
|
-
wait_timeout = 2
|
|
3308
|
-
start_time = time.time()
|
|
3315
|
+
logger.info(f"Closing worker with PID: {worker.pid}")
|
|
3316
|
+
pipe.master_close()
|
|
3309
3317
|
|
|
3310
3318
|
process_dir = os.path.join('/proc', str(worker.pid))
|
|
3311
3319
|
while worker_is_alive(worker) and os.path.exists(process_dir):
|
|
3312
3320
|
logger.info("Waiting for worker {} closed ...".format(worker.pid))
|
|
3313
3321
|
time.sleep(0.5)
|
|
3314
3322
|
|
|
3315
|
-
# maybe the worker is hung by msg_queue.MsgRcv, so break the loop and terminate it in next step
|
|
3316
|
-
if time.time() - start_time > wait_timeout:
|
|
3317
|
-
break
|
|
3318
|
-
|
|
3319
3323
|
# del the handle which hold by master
|
|
3324
|
+
del pipe.in_queue
|
|
3325
|
+
del pipe.res_queue
|
|
3320
3326
|
worker.terminate()
|
|
3321
3327
|
worker.join()
|
|
3322
3328
|
worker.close()
|
|
@@ -3373,8 +3379,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3373
3379
|
self.warning_ctl = None
|
|
3374
3380
|
# cache thread (get_ident()) to worker_id mapping in Python layer
|
|
3375
3381
|
self.python_threads_to_workers = {}
|
|
3376
|
-
self.
|
|
3377
|
-
self.eof_clean_process = None
|
|
3382
|
+
self.eof = None
|
|
3378
3383
|
self.running = False
|
|
3379
3384
|
|
|
3380
3385
|
def __del__(self):
|
|
@@ -3450,39 +3455,19 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3450
3455
|
del workers
|
|
3451
3456
|
os.kill(os.getpid(), signal.SIGTERM)
|
|
3452
3457
|
|
|
3453
|
-
def launch(self, op_id
|
|
3458
|
+
def launch(self, op_id=-1):
|
|
3454
3459
|
"""
|
|
3455
3460
|
Launch Python multiprocessing pool.
|
|
3456
3461
|
|
|
3457
3462
|
Args:
|
|
3458
|
-
op_id
|
|
3459
|
-
op_type (str): Indicate MapOp / BatchOp
|
|
3460
|
-
ftok_keys (list[int]): the ftok key of list for msg queue and shm queue
|
|
3463
|
+
op_id: ID for operation to have Python multiprocessing pool launched
|
|
3461
3464
|
|
|
3462
3465
|
Returns:
|
|
3463
3466
|
Python multiprocessing pool is launched.
|
|
3464
3467
|
"""
|
|
3465
3468
|
self.python_threads_to_workers = {}
|
|
3466
|
-
|
|
3467
|
-
if not isinstance(op_id, int):
|
|
3468
|
-
raise RuntimeError("The op_id is not int.")
|
|
3469
3469
|
self.op_id = op_id
|
|
3470
|
-
|
|
3471
|
-
valid_op_type = [cde.MAP_OP, cde.BATCH_OP]
|
|
3472
|
-
if op_type not in valid_op_type:
|
|
3473
|
-
raise RuntimeError("The op_type: {} is not in {}.".format(op_type, valid_op_type))
|
|
3474
|
-
self.op_type = op_type
|
|
3475
|
-
|
|
3476
|
-
if not isinstance(ftok_keys, list):
|
|
3477
|
-
raise RuntimeError("The ftok_keys is not a list.")
|
|
3478
|
-
if not all(isinstance(x, int) for x in ftok_keys):
|
|
3479
|
-
raise RuntimeError("The item in ftok_keys is not all int.")
|
|
3480
|
-
if len(ftok_keys) != self.num_parallel_workers:
|
|
3481
|
-
raise RuntimeError("The len of ftok_keys is not equal to num_parallel_workers.")
|
|
3482
|
-
self.ftok_keys = ftok_keys
|
|
3483
|
-
|
|
3484
|
-
logger.info("Launching new Python multiprocessing pool for Op: " + self.op_type + "(" + str(self.op_id) + \
|
|
3485
|
-
"), ftok_keys: " + str(self.ftok_keys))
|
|
3470
|
+
logger.info("Launching new Python multiprocessing pool for Op: " + str(self.op_id))
|
|
3486
3471
|
if self.is_mp_enabled():
|
|
3487
3472
|
message = "Launching a new Python multiprocessing pool while a pool already exists!" + \
|
|
3488
3473
|
" The existing pool will be terminated first."
|
|
@@ -3505,21 +3490,30 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3505
3490
|
raise Exception("Pool was already created, close it first.")
|
|
3506
3491
|
|
|
3507
3492
|
self.workers = []
|
|
3493
|
+
self.pipes = []
|
|
3494
|
+
self.check_interval = get_multiprocessing_timeout_interval()
|
|
3508
3495
|
self.warning_ctl = multiprocessing.Value('i', 0)
|
|
3496
|
+
if self.start_method == "fork":
|
|
3497
|
+
# Construct python worker processes
|
|
3498
|
+
for worker_id in range(self.num_parallel_workers):
|
|
3499
|
+
worker = _MPWorker(self.operations, self.warning_ctl, self.max_rowsize, worker_id)
|
|
3500
|
+
worker.start()
|
|
3501
|
+
self.workers.append(worker)
|
|
3502
|
+
else:
|
|
3503
|
+
multiprocessing.set_start_method(self.start_method, True)
|
|
3509
3504
|
|
|
3510
|
-
|
|
3511
|
-
|
|
3512
|
-
|
|
3513
|
-
|
|
3514
|
-
|
|
3515
|
-
|
|
3516
|
-
|
|
3517
|
-
|
|
3518
|
-
|
|
3519
|
-
|
|
3520
|
-
worker.start()
|
|
3505
|
+
# Construct python worker processes
|
|
3506
|
+
for worker_id in range(self.num_parallel_workers):
|
|
3507
|
+
shared_memory = get_enable_shared_mem()
|
|
3508
|
+
pipe = Pipe(self.warning_ctl, shared_memory=shared_memory, max_rowsize=self.max_rowsize)
|
|
3509
|
+
self.check_interval = get_multiprocessing_timeout_interval()
|
|
3510
|
+
worker = multiprocessing.Process(target=WorkerTarget(self.operations, pipe, worker_id),
|
|
3511
|
+
name="MapWorker" + str(worker_id), daemon=True)
|
|
3512
|
+
self.workers.append(worker)
|
|
3513
|
+
self.pipes.append(pipe)
|
|
3514
|
+
worker.start()
|
|
3521
3515
|
|
|
3522
|
-
|
|
3516
|
+
multiprocessing.set_start_method("fork", True)
|
|
3523
3517
|
|
|
3524
3518
|
logger.info("Launch worker process(es): {}".format(self.get_pids()))
|
|
3525
3519
|
|
|
@@ -3533,20 +3527,6 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3533
3527
|
atexit.register(lambda cleanup: cleanup()() if cleanup() is not None else None,
|
|
3534
3528
|
weakref.WeakMethod(self.terminate))
|
|
3535
3529
|
|
|
3536
|
-
# Ensure that all workers are in the running state
|
|
3537
|
-
start = time.time()
|
|
3538
|
-
wait_time = 120 # 120s
|
|
3539
|
-
while True:
|
|
3540
|
-
if self.is_running():
|
|
3541
|
-
logger.info("All workers has been running state.")
|
|
3542
|
-
break
|
|
3543
|
-
else:
|
|
3544
|
-
time.sleep(0.5)
|
|
3545
|
-
if time.time() - start > wait_time:
|
|
3546
|
-
logger.error("All worker processes have not reached the running state within " + str(wait_time) +
|
|
3547
|
-
" seconds, data processing errors may occur.")
|
|
3548
|
-
break
|
|
3549
|
-
|
|
3550
3530
|
def terminate(self):
|
|
3551
3531
|
if self.running:
|
|
3552
3532
|
# abort the monitor first and then close all the workers
|
|
@@ -3575,8 +3555,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3575
3555
|
continue
|
|
3576
3556
|
return self.pids
|
|
3577
3557
|
|
|
3578
|
-
def add_new_workers(self, num_new_workers
|
|
3579
|
-
"""Used by AutoTune"""
|
|
3558
|
+
def add_new_workers(self, num_new_workers):
|
|
3580
3559
|
logger.info(
|
|
3581
3560
|
"Increasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
|
|
3582
3561
|
", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
|
|
@@ -3584,14 +3563,9 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3584
3563
|
num_new_workers) + ".")
|
|
3585
3564
|
self.terminate()
|
|
3586
3565
|
self.num_parallel_workers += num_new_workers
|
|
3566
|
+
self.launch(self.op_id)
|
|
3587
3567
|
|
|
3588
|
-
|
|
3589
|
-
raise RuntimeError("Add new workers failed, the num_workers is not equal size of ftok_keys.")
|
|
3590
|
-
|
|
3591
|
-
self.launch(self.op_id, op_type, ftok_keys)
|
|
3592
|
-
|
|
3593
|
-
def remove_workers(self, num_removed_workers, op_type, ftok_keys):
|
|
3594
|
-
"""Used by AutoTune"""
|
|
3568
|
+
def remove_workers(self, num_removed_workers):
|
|
3595
3569
|
logger.info(
|
|
3596
3570
|
"Decreasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
|
|
3597
3571
|
", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
|
|
@@ -3599,15 +3573,60 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3599
3573
|
num_removed_workers) + ".")
|
|
3600
3574
|
self.terminate()
|
|
3601
3575
|
self.num_parallel_workers -= num_removed_workers
|
|
3602
|
-
|
|
3603
|
-
if self.num_parallel_workers != len(ftok_keys):
|
|
3604
|
-
raise RuntimeError("Remove workers failed, the num_workers is not equal size of ftok_keys.")
|
|
3605
|
-
|
|
3606
|
-
self.launch(self.op_id, op_type, ftok_keys)
|
|
3576
|
+
self.launch(self.op_id)
|
|
3607
3577
|
|
|
3608
3578
|
def is_mp_enabled(self):
|
|
3609
3579
|
return self.workers is not None
|
|
3610
3580
|
|
|
3581
|
+
def execute(self, idx, *args):
|
|
3582
|
+
"""
|
|
3583
|
+
Execute
|
|
3584
|
+
"""
|
|
3585
|
+
t_id = threading.get_ident()
|
|
3586
|
+
# get the worker_id from Python layer cache first, get from Cpp layer if not found.
|
|
3587
|
+
worker_id = self.python_threads_to_workers.setdefault(t_id, self.get_thread_to_worker())
|
|
3588
|
+
if worker_id >= len(self.workers):
|
|
3589
|
+
raise RuntimeError("[Internal] worker_id value is greater than number of available workers!")
|
|
3590
|
+
|
|
3591
|
+
# todo check_iterator_cleanup
|
|
3592
|
+
if self.is_running() and check_iterator_cleanup() is False:
|
|
3593
|
+
if self.start_method == "fork":
|
|
3594
|
+
return self.workers[worker_id].execute(idx, *args)
|
|
3595
|
+
# spawn mode
|
|
3596
|
+
self.pipes[worker_id].master_send(idx, args)
|
|
3597
|
+
time_s = time.time()
|
|
3598
|
+
wait_count = 1
|
|
3599
|
+
while True:
|
|
3600
|
+
cost_time = time.time() - time_s
|
|
3601
|
+
if cost_time / self.check_interval >= wait_count:
|
|
3602
|
+
wait_count += 1
|
|
3603
|
+
logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
|
|
3604
|
+
"worker of the map operation is hanging. "
|
|
3605
|
+
"Check whether the user defined data transform is too slow or the "
|
|
3606
|
+
"output data is too large. You can also set the timeout interval by "
|
|
3607
|
+
"ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
|
|
3608
|
+
"of this log.")
|
|
3609
|
+
pid = self.workers[worker_id].pid
|
|
3610
|
+
logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
|
|
3611
|
+
install_status, _ = subprocess.getstatusoutput("py-spy --version")
|
|
3612
|
+
if install_status == 0:
|
|
3613
|
+
stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
|
|
3614
|
+
logger.warning("Map worker subprocess stack:\n{}".format(stack))
|
|
3615
|
+
else:
|
|
3616
|
+
logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
|
|
3617
|
+
try:
|
|
3618
|
+
res = self.pipes[worker_id].master_receive()
|
|
3619
|
+
except queue.Empty:
|
|
3620
|
+
continue
|
|
3621
|
+
if res is None:
|
|
3622
|
+
# receive finish signal
|
|
3623
|
+
return None
|
|
3624
|
+
if isinstance(res, ExceptionHandler):
|
|
3625
|
+
res.reraise()
|
|
3626
|
+
return res
|
|
3627
|
+
|
|
3628
|
+
return None
|
|
3629
|
+
|
|
3611
3630
|
def _launch_monitor(self):
|
|
3612
3631
|
"""
|
|
3613
3632
|
Launch a clean process and register subprocess to be monitored by the watch dog.
|
|
@@ -3615,10 +3634,10 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3615
3634
|
The watch dog will clean up subprocesses and main process when any subprocess exited.
|
|
3616
3635
|
"""
|
|
3617
3636
|
if platform.system().lower() != 'windows':
|
|
3618
|
-
self.
|
|
3637
|
+
self.eof = multiprocessing.Event()
|
|
3619
3638
|
self.cleaning_process = multiprocessing.Process(target=self._clean_process,
|
|
3620
3639
|
name="MapCleanProcess",
|
|
3621
|
-
args=(self.ppid, self.workers, self.
|
|
3640
|
+
args=(self.ppid, self.workers, self.eof),
|
|
3622
3641
|
daemon=True)
|
|
3623
3642
|
self.cleaning_process.start()
|
|
3624
3643
|
logger.info("Launch clean process {} to monitor worker "
|
|
@@ -3634,9 +3653,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3634
3653
|
"""Deregister workers monitored by the watch dog and join clean process."""
|
|
3635
3654
|
if get_enable_watchdog():
|
|
3636
3655
|
cde.deregister_worker_pids(id(self))
|
|
3637
|
-
if hasattr(self, 'eof') and self.
|
|
3638
|
-
|
|
3639
|
-
self.eof_clean_process.set()
|
|
3656
|
+
if hasattr(self, 'eof') and self.eof is not None:
|
|
3657
|
+
self.eof.set()
|
|
3640
3658
|
if hasattr(self, 'cleaning_process') and self.cleaning_process is not None:
|
|
3641
3659
|
# let the quit event notify the cleaning process to exit
|
|
3642
3660
|
self.cleaning_process.join(timeout=5)
|
|
@@ -3647,14 +3665,20 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3647
3665
|
|
|
3648
3666
|
def is_running(self):
|
|
3649
3667
|
if hasattr(self, 'workers') and self.workers is not None:
|
|
3668
|
+
if self.start_method == "fork":
|
|
3669
|
+
return all([w.is_alive() for w in self.workers])
|
|
3650
3670
|
return all([worker_is_alive(w) for w in self.workers])
|
|
3651
3671
|
return False
|
|
3652
3672
|
|
|
3653
3673
|
def close_all_workers(self):
|
|
3654
3674
|
"""Close all the subprocess workers"""
|
|
3655
3675
|
if hasattr(self, 'workers') and self.workers is not None:
|
|
3656
|
-
|
|
3657
|
-
|
|
3676
|
+
if self.start_method == "fork":
|
|
3677
|
+
for w in self.workers:
|
|
3678
|
+
w.close()
|
|
3679
|
+
else:
|
|
3680
|
+
for i, w in enumerate(self.workers):
|
|
3681
|
+
close_worker(w, self.pipes[i])
|
|
3658
3682
|
|
|
3659
3683
|
check_interval = get_multiprocessing_timeout_interval()
|
|
3660
3684
|
for w in self.workers:
|
|
@@ -3671,8 +3695,12 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3671
3695
|
continue
|
|
3672
3696
|
raise e
|
|
3673
3697
|
try:
|
|
3674
|
-
if
|
|
3675
|
-
|
|
3698
|
+
if self.start_method == "fork":
|
|
3699
|
+
if w.is_alive():
|
|
3700
|
+
os.close(subprocess_file_descriptor)
|
|
3701
|
+
else:
|
|
3702
|
+
if worker_is_alive(w):
|
|
3703
|
+
os.close(subprocess_file_descriptor)
|
|
3676
3704
|
except OSError as e:
|
|
3677
3705
|
# Maybe the file descriptor had been released, so ignore the 'Bad file descriptor'
|
|
3678
3706
|
if "Bad file descriptor" not in str(e):
|
|
@@ -3681,12 +3709,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
|
|
|
3681
3709
|
# use clear to release the handle which is better than self.workers = None
|
|
3682
3710
|
self.workers.clear()
|
|
3683
3711
|
self.workers = None
|
|
3684
|
-
self.
|
|
3685
|
-
self.
|
|
3686
|
-
|
|
3687
|
-
# as it can cause the main process to not exit when PyFunc executes very slowly so release
|
|
3688
|
-
# the shm & msg here
|
|
3689
|
-
cde.release_shm_and_msg_by_worker_pids(self.pids)
|
|
3712
|
+
self.pipes.clear()
|
|
3713
|
+
self.pipes = None
|
|
3690
3714
|
self.pids = None
|
|
3691
3715
|
|
|
3692
3716
|
|
|
@@ -3764,22 +3788,7 @@ class MapDataset(UnionBaseDataset):
|
|
|
3764
3788
|
|
|
3765
3789
|
count_old_transforms, count_new_transforms, count_non_data_vision_transforms = \
|
|
3766
3790
|
self.__count_transforms(operations)
|
|
3767
|
-
count_py_ops = self.__count_py_ops(operations)
|
|
3768
3791
|
count_pyfunc = self.__count_pyfuncs(operations)
|
|
3769
|
-
|
|
3770
|
-
# Whether to execute ops in the thread mode
|
|
3771
|
-
# op_type python_multiprocessing run_in_thread
|
|
3772
|
-
# c_op(s) false yes
|
|
3773
|
-
# c_op(s) true yes
|
|
3774
|
-
# py_op(s) / PyFunc false yes
|
|
3775
|
-
# py_op(s) / PyFunc true no
|
|
3776
|
-
# c_op(s) + py_op(s) / PyFunc false yes
|
|
3777
|
-
# c_op(s) + py_op(s) / PyFunc true no
|
|
3778
|
-
run_in_thread = not self.python_multiprocessing or (count_pyfunc == 0 and count_py_ops == 0) or get_debug_mode()
|
|
3779
|
-
|
|
3780
|
-
if self.python_multiprocessing and platform.system().lower() == 'windows':
|
|
3781
|
-
run_in_thread = True
|
|
3782
|
-
|
|
3783
3792
|
if count_new_transforms + count_pyfunc == len(operations):
|
|
3784
3793
|
prev_op = None
|
|
3785
3794
|
for op in operations:
|
|
@@ -3797,43 +3806,18 @@ class MapDataset(UnionBaseDataset):
|
|
|
3797
3806
|
op.implementation = Implementation.C
|
|
3798
3807
|
prev_op = op
|
|
3799
3808
|
operations = self.__insert_debug_wrapper(operations)
|
|
3800
|
-
|
|
3801
|
-
operations = transforms.transforms.Compose.reduce(operations)
|
|
3809
|
+
operations = transforms.transforms.Compose.reduce(operations)
|
|
3802
3810
|
elif count_old_transforms + count_pyfunc + count_non_data_vision_transforms == len(operations):
|
|
3803
3811
|
operations = self.__insert_debug_wrapper(operations)
|
|
3804
|
-
|
|
3805
|
-
operations = transforms.py_transforms.Compose.reduce(operations)
|
|
3812
|
+
operations = transforms.py_transforms.Compose.reduce(operations)
|
|
3806
3813
|
else:
|
|
3807
3814
|
raise RuntimeError("Mixing old legacy c/py_transforms and new unified transforms is not allowed.")
|
|
3808
3815
|
|
|
3809
|
-
|
|
3810
|
-
self.operations = self.__process_final_operations(operations)
|
|
3811
|
-
else:
|
|
3812
|
-
self.operations = operations
|
|
3816
|
+
self.operations = self.__process_final_operations(operations)
|
|
3813
3817
|
self.prepare_multiprocessing()
|
|
3814
3818
|
|
|
3815
3819
|
callbacks = [cb.create_runtime_obj() for cb in self.callbacks]
|
|
3816
|
-
|
|
3817
|
-
## thread mode
|
|
3818
|
-
if run_in_thread:
|
|
3819
|
-
return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
|
|
3820
|
-
callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
|
|
3821
|
-
|
|
3822
|
-
# Bind self.operations with self.process_pool
|
|
3823
|
-
class _BindProcessPoolWithOperations:
|
|
3824
|
-
def __init__(self, pool, operations):
|
|
3825
|
-
self.pool = pool
|
|
3826
|
-
self.operations = operations
|
|
3827
|
-
|
|
3828
|
-
def __call__(self):
|
|
3829
|
-
pass
|
|
3830
|
-
|
|
3831
|
-
self.bound = _BindProcessPoolWithOperations(self.process_pool, self.operations)
|
|
3832
|
-
|
|
3833
|
-
## process mode
|
|
3834
|
-
# in multi process mode, we just transfer the self.bound which is not really used in c layer
|
|
3835
|
-
# because when the pipeline is running, map thread transfer data through c++ shm & msg to Python Worker Process
|
|
3836
|
-
return cde.MapNode(children[0], [self.bound], self.input_columns, self.output_columns,
|
|
3820
|
+
return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
|
|
3837
3821
|
callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
|
|
3838
3822
|
|
|
3839
3823
|
def __deepcopy__(self, memodict):
|
|
@@ -3886,22 +3870,10 @@ class MapDataset(UnionBaseDataset):
|
|
|
3886
3870
|
@staticmethod
|
|
3887
3871
|
def __count_pyfuncs(operations):
|
|
3888
3872
|
"""
|
|
3889
|
-
Count the number of pyfuncs operations
|
|
3873
|
+
Count the number of pyfuncs operations
|
|
3890
3874
|
"""
|
|
3891
3875
|
return sum([1 if isinstance(op, FuncWrapper) else 0 for op in operations])
|
|
3892
3876
|
|
|
3893
|
-
@staticmethod
|
|
3894
|
-
def __count_py_ops(operations):
|
|
3895
|
-
"""
|
|
3896
|
-
Count the number of python operations which is built-in
|
|
3897
|
-
"""
|
|
3898
|
-
count = 0
|
|
3899
|
-
for op in operations:
|
|
3900
|
-
if hasattr(op, "implementation") and op.implementation != Implementation.C \
|
|
3901
|
-
and op.implementation is not None:
|
|
3902
|
-
count += 1
|
|
3903
|
-
return count
|
|
3904
|
-
|
|
3905
3877
|
@staticmethod
|
|
3906
3878
|
def __count_transforms(operations):
|
|
3907
3879
|
"""
|
|
@@ -3965,6 +3937,7 @@ class MapDataset(UnionBaseDataset):
|
|
|
3965
3937
|
" Ignoring Python multiprocessing for map operation.")
|
|
3966
3938
|
return
|
|
3967
3939
|
if self.python_multiprocessing:
|
|
3940
|
+
iter_specific_operations = []
|
|
3968
3941
|
callable_list = []
|
|
3969
3942
|
|
|
3970
3943
|
# If user didn't specify num_parallel_workers, set it to default
|
|
@@ -3981,6 +3954,18 @@ class MapDataset(UnionBaseDataset):
|
|
|
3981
3954
|
self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(),
|
|
3982
3955
|
self.num_parallel_workers, str(self),
|
|
3983
3956
|
callable_list, self.max_rowsize)
|
|
3957
|
+
# Pass #2
|
|
3958
|
+
idx = 0
|
|
3959
|
+
for op in self.operations:
|
|
3960
|
+
# our c transforms is now callable and should not be run in Python multithreading
|
|
3961
|
+
if MapDataset.__operation_valid_for_multiprocessing(op):
|
|
3962
|
+
# Wrap Python callable into _PythonCallable
|
|
3963
|
+
iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))
|
|
3964
|
+
idx += 1
|
|
3965
|
+
else:
|
|
3966
|
+
# CPP ops remain the same
|
|
3967
|
+
iter_specific_operations.append(op)
|
|
3968
|
+
self.operations = iter_specific_operations
|
|
3984
3969
|
|
|
3985
3970
|
def __insert_debug_wrapper(self, operations):
|
|
3986
3971
|
"""
|