mindspore 2.7.0__cp310-cp310-win_amd64.whl → 2.7.0rc1__cp310-cp310-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mindspore might be problematic. Click here for more details.

Files changed (196)
  1. mindspore/.commit_id +1 -1
  2. mindspore/__init__.py +1 -1
  3. mindspore/_c_dataengine.cp310-win_amd64.pyd +0 -0
  4. mindspore/_c_expression.cp310-win_amd64.pyd +0 -0
  5. mindspore/_c_mindrecord.cp310-win_amd64.pyd +0 -0
  6. mindspore/_checkparam.py +2 -2
  7. mindspore/_extends/builtin_operations.py +3 -3
  8. mindspore/_extends/parallel_compile/akg_compiler/gen_custom_op_files.py +1 -1
  9. mindspore/_extends/parse/__init__.py +3 -3
  10. mindspore/_extends/parse/deprecated/deprecated_tensor_method.py +1 -0
  11. mindspore/_extends/parse/parser.py +22 -28
  12. mindspore/_extends/parse/standard_method.py +1 -15
  13. mindspore/_extends/pijit/pijit_func_white_list.py +5 -2
  14. mindspore/_extends/remote/kernel_build_server_ascend.py +75 -0
  15. mindspore/amp.py +18 -0
  16. mindspore/avcodec-59.dll +0 -0
  17. mindspore/avdevice-59.dll +0 -0
  18. mindspore/avfilter-8.dll +0 -0
  19. mindspore/avformat-59.dll +0 -0
  20. mindspore/avutil-57.dll +0 -0
  21. mindspore/common/__init__.py +12 -18
  22. mindspore/common/_tensor_cpp_method.py +1 -1
  23. mindspore/common/_tensor_docs.py +38 -102
  24. mindspore/common/_utils.py +1 -9
  25. mindspore/common/api.py +106 -155
  26. mindspore/common/{dynamic_shape/auto_dynamic_shape.py → auto_dynamic_shape.py} +23 -17
  27. mindspore/common/dtype.py +57 -98
  28. mindspore/common/dump.py +1 -1
  29. mindspore/common/file_system.py +9 -59
  30. mindspore/common/hook_handle.py +3 -22
  31. mindspore/common/np_dtype.py +3 -3
  32. mindspore/common/parameter.py +20 -4
  33. mindspore/common/recompute.py +4 -2
  34. mindspore/common/tensor.py +52 -38
  35. mindspore/communication/_hccl_management.py +297 -0
  36. mindspore/context.py +21 -15
  37. mindspore/dataset/__init__.py +1 -1
  38. mindspore/dataset/audio/transforms.py +1 -1
  39. mindspore/dataset/core/config.py +1 -35
  40. mindspore/dataset/engine/datasets.py +315 -330
  41. mindspore/dataset/engine/datasets_user_defined.py +22 -38
  42. mindspore/dataset/transforms/c_transforms.py +2 -2
  43. mindspore/dataset/transforms/transforms.py +3 -3
  44. mindspore/dataset/vision/__init__.py +1 -1
  45. mindspore/dataset/vision/py_transforms.py +8 -8
  46. mindspore/dataset/vision/transforms.py +5 -17
  47. mindspore/dataset/vision/utils.py +21 -632
  48. mindspore/device_context/ascend/op_tuning.py +1 -35
  49. mindspore/dnnl.dll +0 -0
  50. mindspore/experimental/llm_boost/ascend_native/llama_boost_ascend_native.py +0 -3
  51. mindspore/include/api/cell.h +4 -28
  52. mindspore/include/api/cfg.h +7 -24
  53. mindspore/include/api/context.h +0 -1
  54. mindspore/include/api/delegate.h +2 -0
  55. mindspore/include/api/dual_abi_helper.h +19 -100
  56. mindspore/include/api/graph.h +1 -14
  57. mindspore/include/api/kernel.h +3 -16
  58. mindspore/include/api/kernel_api.h +1 -9
  59. mindspore/include/api/metrics/accuracy.h +0 -9
  60. mindspore/include/api/model.h +1 -5
  61. mindspore/include/api/model_group.h +0 -4
  62. mindspore/include/api/model_parallel_runner.h +0 -2
  63. mindspore/include/api/status.h +10 -48
  64. mindspore/include/api/types.h +1 -6
  65. mindspore/include/dataset/constants.h +0 -9
  66. mindspore/jpeg62.dll +0 -0
  67. mindspore/mindrecord/tools/cifar10.py +2 -3
  68. mindspore/mindrecord/tools/cifar10_to_mr.py +5 -5
  69. mindspore/mindspore_backend_common.dll +0 -0
  70. mindspore/mindspore_backend_manager.dll +0 -0
  71. mindspore/mindspore_common.dll +0 -0
  72. mindspore/mindspore_core.dll +0 -0
  73. mindspore/mindspore_cpu_res_manager.dll +0 -0
  74. mindspore/mindspore_dump.dll +0 -0
  75. mindspore/mindspore_frontend.dll +0 -0
  76. mindspore/mindspore_glog.dll +0 -0
  77. mindspore/mindspore_memory_pool.dll +0 -0
  78. mindspore/mindspore_ms_backend.dll +0 -0
  79. mindspore/mindspore_ops.dll +0 -0
  80. mindspore/mindspore_ops_host.dll +0 -0
  81. mindspore/mindspore_ops_kernel_common.dll +0 -0
  82. mindspore/mindspore_profiler.dll +0 -0
  83. mindspore/mindspore_pyboost.dll +0 -0
  84. mindspore/mindspore_pynative.dll +0 -0
  85. mindspore/mindspore_res_manager.dll +0 -0
  86. mindspore/mindspore_runtime_pipeline.dll +0 -0
  87. mindspore/mint/distributed/__init__.py +0 -4
  88. mindspore/mint/distributed/distributed.py +14 -217
  89. mindspore/mint/nn/layer/_functions.py +2 -1
  90. mindspore/mint/nn/layer/conv.py +6 -6
  91. mindspore/mint/nn/layer/normalization.py +3 -3
  92. mindspore/nn/cell.py +174 -216
  93. mindspore/nn/layer/activation.py +2 -4
  94. mindspore/nn/layer/basic.py +13 -7
  95. mindspore/nn/layer/image.py +1 -1
  96. mindspore/nn/optim/adam.py +3 -1
  97. mindspore/nn/optim/lamb.py +3 -1
  98. mindspore/nn/optim/tft_wrapper.py +3 -2
  99. mindspore/nn/probability/distribution/_utils/utils.py +2 -2
  100. mindspore/nn/wrap/cell_wrapper.py +5 -39
  101. mindspore/nn/wrap/grad_reducer.py +15 -0
  102. mindspore/numpy/array_creations.py +2 -2
  103. mindspore/numpy/utils_const.py +1 -1
  104. mindspore/opencv_core452.dll +0 -0
  105. mindspore/opencv_imgcodecs452.dll +0 -0
  106. mindspore/opencv_imgproc452.dll +0 -0
  107. mindspore/ops/_grad_experimental/grad_inner_ops.py +9 -0
  108. mindspore/ops/_op_impl/cpu/__init__.py +0 -1
  109. mindspore/ops/auto_generate/cpp_create_prim_instance_helper.py +2 -12
  110. mindspore/ops/auto_generate/gen_extend_func.py +4 -4
  111. mindspore/ops/auto_generate/gen_ops_def.py +16 -290
  112. mindspore/ops/auto_generate/gen_ops_prim.py +76 -563
  113. mindspore/ops/composite/base.py +1 -1
  114. mindspore/ops/composite/multitype_ops/_constexpr_utils.py +1 -1
  115. mindspore/ops/function/__init__.py +0 -1
  116. mindspore/ops/function/array_func.py +6 -10
  117. mindspore/ops/function/debug_func.py +2 -4
  118. mindspore/ops/function/grad/grad_func.py +12 -4
  119. mindspore/ops/function/math_func.py +32 -44
  120. mindspore/ops/function/nn_func.py +20 -18
  121. mindspore/ops/functional.py +1 -2
  122. mindspore/ops/functional_overload.py +12 -23
  123. mindspore/ops/operations/_inner_ops.py +12 -11
  124. mindspore/ops/operations/array_ops.py +50 -4
  125. mindspore/ops/operations/comm_ops.py +15 -1
  126. mindspore/ops/operations/custom_ops.py +4 -10
  127. mindspore/ops/operations/debug_ops.py +6 -6
  128. mindspore/ops/operations/manually_defined/ops_def.py +12 -12
  129. mindspore/ops/operations/math_ops.py +5 -5
  130. mindspore/ops/operations/nn_ops.py +1 -1
  131. mindspore/ops/primitive.py +10 -3
  132. mindspore/ops/tensor_method.py +7 -16
  133. mindspore/ops_generate/pyboost/gen_pyboost_func.py +16 -0
  134. mindspore/parallel/_auto_parallel_context.py +15 -5
  135. mindspore/parallel/_parallel_serialization.py +2 -3
  136. mindspore/parallel/_ps_context.py +2 -2
  137. mindspore/parallel/_transformer/transformer.py +4 -4
  138. mindspore/parallel/_utils.py +11 -5
  139. mindspore/parallel/auto_parallel.py +9 -23
  140. mindspore/parallel/checkpoint_transform.py +0 -2
  141. mindspore/parallel/cluster/process_entity/_api.py +1 -4
  142. mindspore/parallel/cluster/run.py +3 -5
  143. mindspore/parallel/function/reshard_func.py +5 -6
  144. mindspore/parallel/nn/parallel_cell_wrapper.py +3 -40
  145. mindspore/parallel/nn/parallel_grad_reducer.py +8 -0
  146. mindspore/parallel/shard.py +21 -7
  147. mindspore/parallel/transform_safetensors.py +4 -10
  148. mindspore/profiler/analysis/viewer/ascend_kernel_details_viewer.py +9 -10
  149. mindspore/profiler/analysis/viewer/ascend_op_memory_viewer.py +1 -1
  150. mindspore/profiler/common/msprof_cmd_tool.py +2 -2
  151. mindspore/profiler/common/path_manager.py +0 -9
  152. mindspore/profiler/common/profiler_context.py +2 -25
  153. mindspore/profiler/common/profiler_meta_data.py +0 -1
  154. mindspore/profiler/common/profiler_op_analyse.py +6 -10
  155. mindspore/{ops/_op_impl/cpu/joinedstr_op.py → profiler/common/validator/__init__.py} +1 -15
  156. mindspore/profiler/common/validator/validate_path.py +84 -0
  157. mindspore/profiler/dynamic_profiler.py +46 -91
  158. mindspore/profiler/envprofiler.py +5 -30
  159. mindspore/profiler/experimental_config.py +1 -16
  160. mindspore/profiler/platform/cpu_profiler.py +4 -10
  161. mindspore/profiler/platform/npu_profiler.py +1 -1
  162. mindspore/profiler/profiler.py +145 -193
  163. mindspore/profiler/profiler_action_controller.py +1 -1
  164. mindspore/profiler/profiler_interface.py +2 -2
  165. mindspore/rewrite/symbol_tree/symbol_tree.py +1 -1
  166. mindspore/runtime/__init__.py +4 -6
  167. mindspore/runtime/executor.py +0 -27
  168. mindspore/runtime/memory.py +0 -1
  169. mindspore/runtime/thread_bind_core.py +1 -1
  170. mindspore/swresample-4.dll +0 -0
  171. mindspore/swscale-6.dll +0 -0
  172. mindspore/tinyxml2.dll +0 -0
  173. mindspore/train/_utils.py +3 -3
  174. mindspore/train/amp.py +3 -0
  175. mindspore/train/callback/_callback.py +1 -2
  176. mindspore/train/callback/_checkpoint.py +8 -1
  177. mindspore/train/callback/_flops_collector.py +6 -10
  178. mindspore/train/callback/_train_fault_tolerance.py +7 -3
  179. mindspore/train/data_sink.py +4 -4
  180. mindspore/train/dataset_helper.py +5 -5
  181. mindspore/train/model.py +20 -4
  182. mindspore/train/serialization.py +15 -35
  183. mindspore/train/train_thor/model_thor.py +2 -2
  184. mindspore/turbojpeg.dll +0 -0
  185. mindspore/utils/hooks.py +81 -0
  186. mindspore/utils/utils.py +8 -8
  187. mindspore/version.py +1 -1
  188. {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/METADATA +1 -1
  189. {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/RECORD +193 -192
  190. mindspore/_extends/parallel_compile/akg_compiler/custom.py +0 -1109
  191. mindspore/common/dynamic_shape/__init__.py +0 -0
  192. mindspore/common/dynamic_shape/enable_dynamic.py +0 -197
  193. /mindspore/common/{dynamic_shape/_auto_dynamic.py → _auto_dynamic.py} +0 -0
  194. {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/WHEEL +0 -0
  195. {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/entry_points.txt +0 -0
  196. {mindspore-2.7.0.dist-info → mindspore-2.7.0rc1.dist-info}/top_level.txt +0 -0
@@ -29,8 +29,10 @@ import atexit
29
29
  import glob
30
30
  import json
31
31
  import os
32
+ import queue
32
33
  import signal
33
34
  import stat
35
+ import subprocess
34
36
  import warnings
35
37
 
36
38
  import time
@@ -39,7 +41,6 @@ import multiprocessing
39
41
  from importlib import import_module
40
42
  import sys
41
43
  import threading
42
- from types import GeneratorType
43
44
 
44
45
  import copy
45
46
  import weakref
@@ -64,6 +65,7 @@ from mindspore.dataset.engine import samplers
64
65
  from mindspore.dataset.engine.samplers import Shuffle
65
66
  from .iterators import DictIterator, TupleIterator, DummyIterator, check_iterator_cleanup, _set_iterator_cleanup, \
66
67
  ITERATORS_LIST, _unset_iterator_cleanup, _cleanup_the_iterators_if_created
68
+ from .queue import _SharedQueue, _Queue
67
69
  from .validators import check_batch, check_shuffle, check_map, check_filter, check_repeat, check_skip, check_zip, \
68
70
  check_rename, check_device_send, check_take, check_output_shape, check_project, \
69
71
  check_sync_wait, check_zip_dataset, check_add_column, check_concat, check_split, check_bucket_batch_by_length, \
@@ -71,8 +73,7 @@ from .validators import check_batch, check_shuffle, check_map, check_filter, che
71
73
  check_total_batch, check_sync_update
72
74
  from ..core.config import get_callback_timeout, _init_device_info, get_enable_shared_mem, get_num_parallel_workers, \
73
75
  get_enable_watchdog, get_seed, set_seed, get_debug_mode, get_multiprocessing_timeout_interval, \
74
- _get_debug_hook_list, get_multiprocessing_start_method, get_video_backend, set_video_backend, \
75
- get_error_samples_mode, ErrorSamplesMode
76
+ _get_debug_hook_list, get_multiprocessing_start_method
76
77
  from ..core.datatypes import mstype_to_detype
77
78
  from ..core.validator_helpers import replace_none
78
79
  from ..core.py_util_helpers import ExceptionHandler
@@ -2751,6 +2752,8 @@ class BatchDataset(UnionBaseDataset):
2751
2752
 
2752
2753
  self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(), self.num_parallel_workers,
2753
2754
  str(self), [self.per_batch_map], self.max_rowsize)
2755
+ # Wrap per_batch_map into _PythonCallable
2756
+ self.per_batch_map = _PythonCallable(self.per_batch_map, 0, self.process_pool)
2754
2757
  else:
2755
2758
  if self.per_batch_map is not None:
2756
2759
  self.per_batch_map = FuncWrapper(self.per_batch_map)
@@ -3054,6 +3057,95 @@ _OP_NAME = dict()
3054
3057
  _OP_PROCESS = dict()
3055
3058
 
3056
3059
 
3060
+ # PythonCallable wrapper for multiprocess pyfunc
3061
+ class _PythonCallable:
3062
+ """
3063
+ Internal Python function wrapper for multiprocessing pyfunc.
3064
+ """
3065
+
3066
+ def __init__(self, py_callable, idx, pool=None):
3067
+ # Original Python callable from user.
3068
+ self.py_callable = py_callable
3069
+ # Process pool created for current iterator.
3070
+ self.pool = pool
3071
+ # Python callable index
3072
+ self.idx = idx
3073
+
3074
+ def __call__(self, *args):
3075
+ result = None
3076
+ get_data_from_worker_process = False
3077
+ while get_data_from_worker_process is False:
3078
+ if self.pool.is_running() and check_iterator_cleanup() is False:
3079
+ try:
3080
+ result = self.pool.execute(self.idx, *args)
3081
+ except multiprocessing.TimeoutError:
3082
+ continue
3083
+ get_data_from_worker_process = True
3084
+ else:
3085
+ # worker process is stopped
3086
+ logger.info("The worker process of map operation is stopped. "
3087
+ "So return None to main thread and break the main thread.")
3088
+ return None
3089
+ # got value from worker process
3090
+ if not isinstance(result, tuple) and get_data_from_worker_process is True:
3091
+ result = (result,)
3092
+ return result
3093
+
3094
+ def to_json(self):
3095
+ return self.py_callable.to_json()
3096
+
3097
+
3098
+ # used when python_multiprocessing=True in map
3099
+ class Pipe:
3100
+ """
3101
+ Class to handle communication between the master process and the worker processes.
3102
+ """
3103
+
3104
+ def __init__(self, warning_ctl, shared_memory=False, max_rowsize=(-1, -1)):
3105
+ self.shared_memory = shared_memory
3106
+ self.eof = multiprocessing.Event()
3107
+ if self.shared_memory:
3108
+ self.in_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[0])
3109
+ self.res_queue = _SharedQueue(1, warning_ctl, max_rowsize=max_rowsize[1])
3110
+ else:
3111
+ self.in_queue = _Queue(1)
3112
+ self.res_queue = _Queue(1)
3113
+ self.in_queue.cancel_join_thread() # Ensure that the process does not hung when exiting
3114
+
3115
+ def master_send(self, func_index, data):
3116
+ self.in_queue.put_nowait((func_index, *data))
3117
+
3118
+ def master_receive(self):
3119
+ if self.eof is None:
3120
+ raise RuntimeError("EOF is none when get data from worker.")
3121
+ if self.eof.is_set():
3122
+ return None
3123
+ return self.res_queue.get(timeout=1)
3124
+
3125
+ def master_close(self):
3126
+ self.eof.set()
3127
+ self.send_finish_signal_to_worker()
3128
+ self.send_finish_signal()
3129
+
3130
+ def send_finish_signal(self):
3131
+ self.worker_send(None)
3132
+
3133
+ def send_finish_signal_to_worker(self):
3134
+ self.master_send(0, "QUIT")
3135
+
3136
+ def worker_send(self, data):
3137
+ self.res_queue.put_until(data, timeout=1, exit_signal=self.eof)
3138
+
3139
+ def worker_receive(self):
3140
+ result = self.in_queue.get_until(timeout=1, exit_signal=self.eof)
3141
+ if result is None:
3142
+ return result
3143
+ if len(result) == 1:
3144
+ raise RuntimeError(f"Corrupted data. Worker received {len(result)} elements, it should be more than 1.")
3145
+ func_index, *data = result
3146
+ return func_index, tuple(data)
3147
+
3148
+
3057
3149
  def _main_process_already_exit():
3058
3150
  """
3059
3151
  Judge whether main process already exit.
@@ -3066,18 +3158,15 @@ def _main_process_already_exit():
3066
3158
  return False
3067
3159
 
3068
3160
 
3069
- def _worker_loop(quit_signal, operations, worker_id, op_type, key, video_backend=None):
3161
+ def _worker_loop(operations, pipe, worker_id):
3070
3162
  """
3071
3163
  Multiprocess worker process loop.
3072
- The worker process(Python Layer) gets data from / sends data to map / batch thread(C++ layer) by message queue
3073
- and shared memory. This logic no longer uses the Python multi-process pool, in_queue, and out_queue for
3074
- data transferring.
3075
3164
  """
3076
3165
  # Initialize C++ side signal handlers
3077
3166
  cde.register_worker_handlers()
3078
3167
 
3079
- if video_backend is not None:
3080
- set_video_backend(video_backend)
3168
+ # Ensure that the process does not hang when exiting
3169
+ pipe.res_queue.cancel_join_thread()
3081
3170
 
3082
3171
  def _ignore_sigint():
3083
3172
  """
@@ -3091,197 +3180,121 @@ def _worker_loop(quit_signal, operations, worker_id, op_type, key, video_backend
3091
3180
  if get_seed() != 5489:
3092
3181
  set_seed(get_seed() + worker_id)
3093
3182
 
3094
- msg_queue = cde.MessageQueue(key)
3095
- msg_queue.set_release_flag(False)
3096
- shm_queue = cde.SharedMemoryQueue(key)
3097
- shm_queue.set_release_flag(False)
3098
-
3099
- pid = str(os.getpid())
3100
- ppid = str(os.getppid())
3101
-
3102
- # Scenario: when the main process is killed, worker processe needs to release shm & msg.
3103
- # The shm id and msg id should be released by SIGTERM in worker handler
3104
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3105
- msg_queue.msg_queue_id)
3106
-
3107
- num_receive = 0
3108
- num_send = 0
3109
3183
  while not _main_process_already_exit():
3110
3184
  _ignore_sigint()
3111
3185
 
3112
- # quit by close_worker
3113
- if quit_signal.is_set():
3186
+ result = pipe.worker_receive()
3187
+ if result is None:
3114
3188
  return
3115
-
3116
- # >> receive procedure >>
3117
- ## 1. get message queue which contains shared memory info from map C++ thread in main process
3189
+ (idx, input_tensors) = result
3190
+ if input_tensors == "QUIT":
3191
+ break
3118
3192
  try:
3119
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3120
- msg_queue.msg_queue_id)
3121
- msg_queue.msg_rcv(cde.MASTER_SEND_DATA_MSG)
3122
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3123
- msg_queue.msg_queue_id)
3124
- except RuntimeError as err:
3125
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3126
- msg_queue.msg_queue_id)
3127
- # the msg_queue had been released by main process, ignore it in worker process
3128
- if "errno: 2" in str(err):
3129
- # Because the worker process does not release msg and shm, continue
3130
- continue
3131
- raise err
3193
+ output_tensors = operations[idx](*input_tensors)
3132
3194
 
3133
- ## when the message queue had been released, break the loop
3134
- if msg_queue.message_queue_state() == cde.MessageState.RELEASED:
3135
- logger.info("The message queue had been released, worker loop end.")
3136
- break
3195
+ pipe.worker_send(output_tensors)
3196
+ except Exception:
3197
+ pipe.worker_send(ExceptionHandler(where="in map(or batch) worker and execute Python function"))
3198
+ # Do not return
3137
3199
 
3138
- num_receive += 1
3200
+ # release the queue when stop the worker by master
3201
+ del pipe.in_queue
3202
+ del pipe.res_queue
3139
3203
 
3140
- logger.info("Python process {} worker({}) receives {} samples from map thread.".format(op_type, worker_id,
3141
- num_receive))
3142
3204
 
3143
- # convert the data from shm to python data
3144
- if op_type == cde.MAP_OP:
3145
- ## 2. construct shared memory to TensorRow which contains one / more columns
3146
- tensor_row = shm_queue.to_tensor_row(msg_queue.shm_id, msg_queue.shm_size)
3205
+ def worker_target(operations, worker_id):
3206
+ logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
3207
+ return lambda pipe: _worker_loop(operations, pipe, worker_id)
3147
3208
 
3148
- ## 3. convert TensorRow to Python tuple which elements are a column
3149
- tuple_column = cde.convert_tensor_row_to_py_tuple(tensor_row)
3150
3209
 
3151
- py_func_input = tuple_column
3152
- elif op_type == cde.BATCH_OP:
3153
- ## 2. construct shard memory to TensorTable which contains one / more TensorRow & CBatchInfo
3154
- tensor_table, batch_info, _ = shm_queue.to_tensor_table(msg_queue.shm_id, msg_queue.shm_size)
3210
+ class WorkerTarget:
3211
+ def __init__(self, operations, pipe, worker_id):
3212
+ self.operations = operations
3213
+ self.pipe = pipe
3214
+ self.worker_id = worker_id
3215
+ logger.info("Multiprocessing start method: {}".format(multiprocessing.get_start_method()))
3155
3216
 
3156
- ## 3. convert TensorTable to Python tuple tuple
3157
- # The tuple indicate the multi columns
3158
- # The list indicate the multi rows
3159
- tuple_list_column = cde.convert_tensor_table_to_py_tuple_list(tensor_table)
3217
+ def __call__(self):
3218
+ return _worker_loop(self.operations, self.pipe, self.worker_id)
3160
3219
 
3161
- py_func_input = (*tuple_list_column, batch_info)
3162
- else:
3163
- raise RuntimeError("The op_type: {} is invalid.".format(op_type))
3164
3220
 
3165
- # execute the pyfunc
3166
- try:
3167
- py_func_output = py_func_input
3221
+ class _MPWorker(multiprocessing.Process):
3222
+ """
3223
+ Worker process for multiprocessing.
3224
+ """
3168
3225
 
3169
- # execute the remaining operations
3170
- for idx in range(len(operations)):
3171
- if isinstance(py_func_output, tuple):
3172
- py_func_output = operations[idx](*py_func_output)
3226
+ def __init__(self, operations, warning_ctl, max_rowsize=(-1, -1), worker_id=0):
3227
+ shared_memory = get_enable_shared_mem()
3228
+ self.pipe = Pipe(warning_ctl, shared_memory=shared_memory, max_rowsize=max_rowsize)
3229
+ self.check_interval = get_multiprocessing_timeout_interval()
3230
+ super().__init__(target=worker_target(operations, worker_id), name="MapWorker" + str(worker_id),
3231
+ args=(self.pipe,), daemon=True)
3232
+
3233
+ def execute(self, idx, *args):
3234
+ """Acquiring data from a worker in an infinite loop"""
3235
+ self.pipe.master_send(idx, args)
3236
+ time_s = time.time()
3237
+ wait_count = 1
3238
+ while True:
3239
+ cost_time = time.time() - time_s
3240
+ if cost_time / self.check_interval >= wait_count:
3241
+ wait_count += 1
3242
+ logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
3243
+ "worker of the map operation is hanging. "
3244
+ "Check whether the user defined data transform is too slow or the "
3245
+ "output data is too large. You can also set the timeout interval by "
3246
+ "ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
3247
+ "of this log.")
3248
+ pid = self.pid
3249
+ logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
3250
+ install_status, _ = subprocess.getstatusoutput("py-spy --version")
3251
+ if install_status == 0:
3252
+ stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
3253
+ logger.warning("Map worker subprocess stack:\n{}".format(stack))
3173
3254
  else:
3174
- py_func_output = operations[idx](py_func_output)
3175
-
3176
- # << send procedure <<
3177
- # the result is None
3178
- if py_func_output is None:
3179
- raise RuntimeError("Got None from Python Function which is defined by {}".format(op_type))
3180
-
3181
- # convert the output to tuple
3182
- if not isinstance(py_func_output, tuple):
3183
- py_func_output = (py_func_output,)
3184
-
3185
- if op_type == cde.MAP_OP:
3186
- # check if the map return Generator type
3187
- for item in py_func_output:
3188
- if isinstance(item, GeneratorType):
3189
- raise RuntimeError("Cannot pickle <class 'generator'> object, please verify pyfunc "
3190
- "return with numpy array")
3191
-
3192
- ## 1. convert Python tuple to TensorRow
3193
- output_tensor_row = cde.convert_py_tuple_to_tensor_row(py_func_output)
3194
-
3195
- ## 2. convert TensorRow to shared memory
3196
- shm_queue.from_tensor_row(output_tensor_row)
3197
- elif op_type == cde.BATCH_OP:
3198
- ## 1. convert Python tuple tuple to TensorTable
3199
- output_tensor_table, concat_batch = cde.convert_py_tuple_list_to_tensor_table(py_func_output)
3200
-
3201
- ## 2. convert TensorTable to shared memory
3202
- shm_queue.from_tensor_table(output_tensor_table, batch_info, concat_batch)
3203
- else:
3204
- raise RuntimeError("The op_type: {} is invalid.".format(op_type))
3205
-
3206
- ## 3. send message queue which contains shared memory to map C++ thread in main process
3207
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3208
- msg_queue.msg_queue_id)
3209
- msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
3210
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3211
- msg_queue.msg_queue_id)
3212
-
3213
- num_send += 1
3214
- logger.info("Python process {} worker({}) sends {} samples to map thread.".format(op_type, worker_id,
3215
- num_send))
3216
- except Exception:
3255
+ logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
3217
3256
  try:
3218
- if op_type == cde.MAP_OP:
3219
- pyfunc_err = ExceptionHandler(where="in map worker and execute Python function")
3220
- elif op_type == cde.BATCH_OP:
3221
- pyfunc_err = ExceptionHandler(where="in batch(per_batch_map) worker and execute Python function")
3222
- else:
3223
- pyfunc_err = "The op_type: {} is invalid.".format(op_type)
3224
- pyfunc_err.reraise()
3225
- except Exception as err:
3226
- _, _, exc_tb = sys.exc_info()
3227
- fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
3228
-
3229
- if op_type == cde.MAP_OP:
3230
- logger.info("Got exception {} from Map Worker({})".format(str(err), worker_id))
3231
- elif op_type == cde.BATCH_OP:
3232
- logger.info("Got exception {} from Batch Worker({})".format(str(err), worker_id))
3233
- else:
3234
- logger.info("The op_type: {} is invalid.".format(op_type))
3235
-
3236
- # err_code, lineno, filename, err_desc
3237
- msg_queue.serialize_status(cde.StatusCode.MD_PY_FUNC_EXCEPTION, exc_tb.tb_lineno, fname, str(err))
3238
-
3239
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3240
- msg_queue.msg_queue_id)
3241
- msg_queue.msg_snd(cde.WORKER_SEND_DATA_MSG, shm_queue.get_shm_id(), shm_queue.get_shm_size())
3242
- cde.register_shm_id_and_msg_id(pid + "_" + ppid + "_" + str(op_type), shm_queue.get_shm_id(),
3243
- msg_queue.msg_queue_id)
3244
-
3245
- # worker error
3246
- if get_error_samples_mode() == ErrorSamplesMode.RETURN:
3247
- break
3248
- else:
3249
- # continue the loop, when the get_error_samples_mode() is REPLACE or SKIP
3250
- continue
3251
-
3252
- # release the eager executor which is used by current process
3253
- transforms.transforms.clean_unused_executors()
3254
-
3255
- while not _main_process_already_exit():
3256
- # quit by close_worker
3257
- if quit_signal.is_set():
3257
+ res = self.pipe.master_receive()
3258
+ except queue.Empty:
3259
+ continue
3260
+ if res is None:
3261
+ # receive finish signal
3262
+ return None
3263
+ if isinstance(res, ExceptionHandler):
3264
+ res.reraise()
3265
+ return res
3266
+
3267
+ def close(self):
3268
+ try:
3269
+ if self.is_alive():
3270
+ # release the eager executor which is used by current process
3271
+ transforms.transforms.clean_unused_executors()
3272
+
3273
+ logger.info(f"Closing worker with PID: {self.pid}")
3274
+ self.pipe.master_close()
3275
+
3276
+ process_dir = os.path.join('/proc', str(self.pid))
3277
+ while self.is_alive() and os.path.exists(process_dir):
3278
+ logger.info("Waiting for worker {} closed ...".format(self.pid))
3279
+ time.sleep(0.001)
3280
+
3281
+ # del the handle which hold by master
3282
+ del self.pipe.in_queue
3283
+ del self.pipe.res_queue
3284
+ super().terminate()
3285
+ super().join()
3286
+ super().close()
3287
+
3288
+ except ValueError:
3289
+ # Process has been closed already
3258
3290
  return
3291
+ return
3259
3292
 
3260
- logger.info("The worker process is waiting for the main process to exit.")
3261
- time.sleep(0.1)
3262
-
3263
- # the main process is not exist yet which maybe killed -9
3264
- msg_queue.set_release_flag(True)
3265
- msg_queue.release()
3266
- shm_queue.set_release_flag(True)
3267
- shm_queue.release()
3268
-
3269
-
3270
- class WorkerTarget:
3271
- """Mulitprocess mode for dataset map or batch"""
3272
- def __init__(self, quit_signal, operations, worker_id, op_type, ftok_key):
3273
- self.quit_signal = quit_signal
3274
- self.operations = operations
3275
- self.worker_id = worker_id
3276
- self.op_type = op_type
3277
- self.ftok_key = ftok_key
3278
- start_method = multiprocessing.get_start_method()
3279
- logger.info("Multiprocessing start method: {}".format(start_method))
3280
- self.video_backend = get_video_backend() if start_method == 'spawn' else None
3281
-
3282
- def __call__(self):
3283
- return _worker_loop(self.quit_signal, self.operations, self.worker_id, self.op_type, self.ftok_key,
3284
- self.video_backend)
3293
+ def is_alive(self):
3294
+ try:
3295
+ return super().is_alive()
3296
+ except ValueError:
3297
+ return False
3285
3298
 
3286
3299
 
3287
3300
  def worker_is_alive(worker):
@@ -3292,31 +3305,24 @@ def worker_is_alive(worker):
3292
3305
  return False
3293
3306
 
3294
3307
 
3295
- def close_worker(worker, eof):
3308
+ def close_worker(worker, pipe):
3296
3309
  """Close the subprocess worker in spawn mode"""
3297
3310
  try:
3298
3311
  if worker_is_alive(worker):
3299
3312
  # release the eager executor which is used by current process
3300
3313
  transforms.transforms.clean_unused_executors()
3301
3314
 
3302
- # let the worker exit
3303
- logger.info("Set eof flag for worker with PID: {}.".format(worker.pid))
3304
- eof.set()
3305
-
3306
- # wait timeout
3307
- wait_timeout = 2
3308
- start_time = time.time()
3315
+ logger.info(f"Closing worker with PID: {worker.pid}")
3316
+ pipe.master_close()
3309
3317
 
3310
3318
  process_dir = os.path.join('/proc', str(worker.pid))
3311
3319
  while worker_is_alive(worker) and os.path.exists(process_dir):
3312
3320
  logger.info("Waiting for worker {} closed ...".format(worker.pid))
3313
3321
  time.sleep(0.5)
3314
3322
 
3315
- # maybe the worker is hung by msg_queue.MsgRcv, so break the loop and terminate it in next step
3316
- if time.time() - start_time > wait_timeout:
3317
- break
3318
-
3319
3323
  # del the handle which hold by master
3324
+ del pipe.in_queue
3325
+ del pipe.res_queue
3320
3326
  worker.terminate()
3321
3327
  worker.join()
3322
3328
  worker.close()
@@ -3373,8 +3379,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3373
3379
  self.warning_ctl = None
3374
3380
  # cache thread (get_ident()) to worker_id mapping in Python layer
3375
3381
  self.python_threads_to_workers = {}
3376
- self.eof_workers = []
3377
- self.eof_clean_process = None
3382
+ self.eof = None
3378
3383
  self.running = False
3379
3384
 
3380
3385
  def __del__(self):
@@ -3450,39 +3455,19 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3450
3455
  del workers
3451
3456
  os.kill(os.getpid(), signal.SIGTERM)
3452
3457
 
3453
- def launch(self, op_id, op_type, ftok_keys):
3458
+ def launch(self, op_id=-1):
3454
3459
  """
3455
3460
  Launch Python multiprocessing pool.
3456
3461
 
3457
3462
  Args:
3458
- op_id (int): ID for operation to have Python multiprocessing pool launched
3459
- op_type (str): Indicate MapOp / BatchOp
3460
- ftok_keys (list[int]): the ftok key of list for msg queue and shm queue
3463
+ op_id: ID for operation to have Python multiprocessing pool launched
3461
3464
 
3462
3465
  Returns:
3463
3466
  Python multiprocessing pool is launched.
3464
3467
  """
3465
3468
  self.python_threads_to_workers = {}
3466
-
3467
- if not isinstance(op_id, int):
3468
- raise RuntimeError("The op_id is not int.")
3469
3469
  self.op_id = op_id
3470
-
3471
- valid_op_type = [cde.MAP_OP, cde.BATCH_OP]
3472
- if op_type not in valid_op_type:
3473
- raise RuntimeError("The op_type: {} is not in {}.".format(op_type, valid_op_type))
3474
- self.op_type = op_type
3475
-
3476
- if not isinstance(ftok_keys, list):
3477
- raise RuntimeError("The ftok_keys is not a list.")
3478
- if not all(isinstance(x, int) for x in ftok_keys):
3479
- raise RuntimeError("The item in ftok_keys is not all int.")
3480
- if len(ftok_keys) != self.num_parallel_workers:
3481
- raise RuntimeError("The len of ftok_keys is not equal to num_parallel_workers.")
3482
- self.ftok_keys = ftok_keys
3483
-
3484
- logger.info("Launching new Python multiprocessing pool for Op: " + self.op_type + "(" + str(self.op_id) + \
3485
- "), ftok_keys: " + str(self.ftok_keys))
3470
+ logger.info("Launching new Python multiprocessing pool for Op: " + str(self.op_id))
3486
3471
  if self.is_mp_enabled():
3487
3472
  message = "Launching a new Python multiprocessing pool while a pool already exists!" + \
3488
3473
  " The existing pool will be terminated first."
@@ -3505,21 +3490,30 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3505
3490
  raise Exception("Pool was already created, close it first.")
3506
3491
 
3507
3492
  self.workers = []
3493
+ self.pipes = []
3494
+ self.check_interval = get_multiprocessing_timeout_interval()
3508
3495
  self.warning_ctl = multiprocessing.Value('i', 0)
3496
+ if self.start_method == "fork":
3497
+ # Construct python worker processes
3498
+ for worker_id in range(self.num_parallel_workers):
3499
+ worker = _MPWorker(self.operations, self.warning_ctl, self.max_rowsize, worker_id)
3500
+ worker.start()
3501
+ self.workers.append(worker)
3502
+ else:
3503
+ multiprocessing.set_start_method(self.start_method, True)
3509
3504
 
3510
- multiprocessing.set_start_method(self.start_method, True)
3511
-
3512
- # Construct python worker processes
3513
- for worker_id in range(self.num_parallel_workers):
3514
- eof = multiprocessing.Event()
3515
- worker = multiprocessing.Process(target=WorkerTarget(eof, self.operations, worker_id, self.op_type,
3516
- self.ftok_keys[worker_id]),
3517
- name="MapWorker" + str(worker_id), daemon=True)
3518
- self.eof_workers.append(eof)
3519
- self.workers.append(worker)
3520
- worker.start()
3505
+ # Construct python worker processes
3506
+ for worker_id in range(self.num_parallel_workers):
3507
+ shared_memory = get_enable_shared_mem()
3508
+ pipe = Pipe(self.warning_ctl, shared_memory=shared_memory, max_rowsize=self.max_rowsize)
3509
+ self.check_interval = get_multiprocessing_timeout_interval()
3510
+ worker = multiprocessing.Process(target=WorkerTarget(self.operations, pipe, worker_id),
3511
+ name="MapWorker" + str(worker_id), daemon=True)
3512
+ self.workers.append(worker)
3513
+ self.pipes.append(pipe)
3514
+ worker.start()
3521
3515
 
3522
- multiprocessing.set_start_method("fork", True)
3516
+ multiprocessing.set_start_method("fork", True)
3523
3517
 
3524
3518
  logger.info("Launch worker process(es): {}".format(self.get_pids()))
3525
3519
 
@@ -3533,20 +3527,6 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3533
3527
  atexit.register(lambda cleanup: cleanup()() if cleanup() is not None else None,
3534
3528
  weakref.WeakMethod(self.terminate))
3535
3529
 
3536
- # Ensure that all workers are in the running state
3537
- start = time.time()
3538
- wait_time = 120 # 120s
3539
- while True:
3540
- if self.is_running():
3541
- logger.info("All workers has been running state.")
3542
- break
3543
- else:
3544
- time.sleep(0.5)
3545
- if time.time() - start > wait_time:
3546
- logger.error("All worker processes have not reached the running state within " + str(wait_time) +
3547
- " seconds, data processing errors may occur.")
3548
- break
3549
-
3550
3530
  def terminate(self):
3551
3531
  if self.running:
3552
3532
  # abort the monitor first and then close all the workers
@@ -3575,8 +3555,7 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3575
3555
  continue
3576
3556
  return self.pids
3577
3557
 
3578
- def add_new_workers(self, num_new_workers, op_type, ftok_keys):
3579
- """Used by AutoTune"""
3558
+ def add_new_workers(self, num_new_workers):
3580
3559
  logger.info(
3581
3560
  "Increasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
3582
3561
  ", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
@@ -3584,14 +3563,9 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3584
3563
  num_new_workers) + ".")
3585
3564
  self.terminate()
3586
3565
  self.num_parallel_workers += num_new_workers
3566
+ self.launch(self.op_id)
3587
3567
 
3588
- if self.num_parallel_workers != len(ftok_keys):
3589
- raise RuntimeError("Add new workers failed, the num_workers is not equal size of ftok_keys.")
3590
-
3591
- self.launch(self.op_id, op_type, ftok_keys)
3592
-
3593
- def remove_workers(self, num_removed_workers, op_type, ftok_keys):
3594
- """Used by AutoTune"""
3568
+ def remove_workers(self, num_removed_workers):
3595
3569
  logger.info(
3596
3570
  "Decreasing num_parallel_workers of Python Multiprocessing pool for Op:" + str(self.op_id) +
3597
3571
  ", old num_workers=" + str(self.num_parallel_workers) + " new num_workers=" + str(
@@ -3599,15 +3573,60 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3599
3573
  num_removed_workers) + ".")
3600
3574
  self.terminate()
3601
3575
  self.num_parallel_workers -= num_removed_workers
3602
-
3603
- if self.num_parallel_workers != len(ftok_keys):
3604
- raise RuntimeError("Remove workers failed, the num_workers is not equal size of ftok_keys.")
3605
-
3606
- self.launch(self.op_id, op_type, ftok_keys)
3576
+ self.launch(self.op_id)
3607
3577
 
3608
3578
  def is_mp_enabled(self):
3609
3579
  return self.workers is not None
3610
3580
 
3581
+ def execute(self, idx, *args):
3582
+ """
3583
+ Execute
3584
+ """
3585
+ t_id = threading.get_ident()
3586
+ # get the worker_id from Python layer cache first, get from Cpp layer if not found.
3587
+ worker_id = self.python_threads_to_workers.setdefault(t_id, self.get_thread_to_worker())
3588
+ if worker_id >= len(self.workers):
3589
+ raise RuntimeError("[Internal] worker_id value is greater than number of available workers!")
3590
+
3591
+ # todo check_iterator_cleanup
3592
+ if self.is_running() and check_iterator_cleanup() is False:
3593
+ if self.start_method == "fork":
3594
+ return self.workers[worker_id].execute(idx, *args)
3595
+ # spawn mode
3596
+ self.pipes[worker_id].master_send(idx, args)
3597
+ time_s = time.time()
3598
+ wait_count = 1
3599
+ while True:
3600
+ cost_time = time.time() - time_s
3601
+ if cost_time / self.check_interval >= wait_count:
3602
+ wait_count += 1
3603
+ logger.warning("It has been waiting for " + "%.3f" % cost_time + "s because the sub-process "
3604
+ "worker of the map operation is hanging. "
3605
+ "Check whether the user defined data transform is too slow or the "
3606
+ "output data is too large. You can also set the timeout interval by "
3607
+ "ds.config.set_multiprocessing_timeout_interval to adjust the output frequency "
3608
+ "of this log.")
3609
+ pid = self.workers[worker_id].pid
3610
+ logger.warning("Map worker subprocess ID {} is stuck.".format(pid))
3611
+ install_status, _ = subprocess.getstatusoutput("py-spy --version")
3612
+ if install_status == 0:
3613
+ stack = subprocess.getoutput("py-spy dump -p {} -l".format(pid))
3614
+ logger.warning("Map worker subprocess stack:\n{}".format(stack))
3615
+ else:
3616
+ logger.warning("Please `pip install py-spy` to get the stacks of the stuck process.")
3617
+ try:
3618
+ res = self.pipes[worker_id].master_receive()
3619
+ except queue.Empty:
3620
+ continue
3621
+ if res is None:
3622
+ # receive finish signal
3623
+ return None
3624
+ if isinstance(res, ExceptionHandler):
3625
+ res.reraise()
3626
+ return res
3627
+
3628
+ return None
3629
+
3611
3630
  def _launch_monitor(self):
3612
3631
  """
3613
3632
  Launch a clean process and register subprocess to be monitored by the watch dog.
@@ -3615,10 +3634,10 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3615
3634
  The watch dog will clean up subprocesses and main process when any subprocess exited.
3616
3635
  """
3617
3636
  if platform.system().lower() != 'windows':
3618
- self.eof_clean_process = multiprocessing.Event()
3637
+ self.eof = multiprocessing.Event()
3619
3638
  self.cleaning_process = multiprocessing.Process(target=self._clean_process,
3620
3639
  name="MapCleanProcess",
3621
- args=(self.ppid, self.workers, self.eof_clean_process),
3640
+ args=(self.ppid, self.workers, self.eof),
3622
3641
  daemon=True)
3623
3642
  self.cleaning_process.start()
3624
3643
  logger.info("Launch clean process {} to monitor worker "
@@ -3634,9 +3653,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3634
3653
  """Deregister workers monitored by the watch dog and join clean process."""
3635
3654
  if get_enable_watchdog():
3636
3655
  cde.deregister_worker_pids(id(self))
3637
- if hasattr(self, 'eof') and self.eof_clean_process is not None:
3638
- logger.info("Set eof flag for cleaning_process.")
3639
- self.eof_clean_process.set()
3656
+ if hasattr(self, 'eof') and self.eof is not None:
3657
+ self.eof.set()
3640
3658
  if hasattr(self, 'cleaning_process') and self.cleaning_process is not None:
3641
3659
  # let the quit event notify the cleaning process to exit
3642
3660
  self.cleaning_process.join(timeout=5)
@@ -3647,14 +3665,20 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3647
3665
 
3648
3666
  def is_running(self):
3649
3667
  if hasattr(self, 'workers') and self.workers is not None:
3668
+ if self.start_method == "fork":
3669
+ return all([w.is_alive() for w in self.workers])
3650
3670
  return all([worker_is_alive(w) for w in self.workers])
3651
3671
  return False
3652
3672
 
3653
3673
  def close_all_workers(self):
3654
3674
  """Close all the subprocess workers"""
3655
3675
  if hasattr(self, 'workers') and self.workers is not None:
3656
- for index in range(len(self.workers)):
3657
- close_worker(self.workers[index], self.eof_workers[index])
3676
+ if self.start_method == "fork":
3677
+ for w in self.workers:
3678
+ w.close()
3679
+ else:
3680
+ for i, w in enumerate(self.workers):
3681
+ close_worker(w, self.pipes[i])
3658
3682
 
3659
3683
  check_interval = get_multiprocessing_timeout_interval()
3660
3684
  for w in self.workers:
@@ -3671,8 +3695,12 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3671
3695
  continue
3672
3696
  raise e
3673
3697
  try:
3674
- if worker_is_alive(w):
3675
- os.close(subprocess_file_descriptor)
3698
+ if self.start_method == "fork":
3699
+ if w.is_alive():
3700
+ os.close(subprocess_file_descriptor)
3701
+ else:
3702
+ if worker_is_alive(w):
3703
+ os.close(subprocess_file_descriptor)
3676
3704
  except OSError as e:
3677
3705
  # Maybe the file descriptor had been released, so ignore the 'Bad file descriptor'
3678
3706
  if "Bad file descriptor" not in str(e):
@@ -3681,12 +3709,8 @@ class _PythonMultiprocessing(cde.PythonMultiprocessingRuntime):
3681
3709
  # use clear to release the handle which is better than self.workers = None
3682
3710
  self.workers.clear()
3683
3711
  self.workers = None
3684
- self.eof_workers.clear()
3685
- self.eof_workers = []
3686
-
3687
- # as it can cause the main process to not exit when PyFunc executes very slowly so release
3688
- # the shm & msg here
3689
- cde.release_shm_and_msg_by_worker_pids(self.pids)
3712
+ self.pipes.clear()
3713
+ self.pipes = None
3690
3714
  self.pids = None
3691
3715
 
3692
3716
 
@@ -3764,22 +3788,7 @@ class MapDataset(UnionBaseDataset):
3764
3788
 
3765
3789
  count_old_transforms, count_new_transforms, count_non_data_vision_transforms = \
3766
3790
  self.__count_transforms(operations)
3767
- count_py_ops = self.__count_py_ops(operations)
3768
3791
  count_pyfunc = self.__count_pyfuncs(operations)
3769
-
3770
- # Whether to execute ops in the thread mode
3771
- # op_type python_multiprocessing run_in_thread
3772
- # c_op(s) false yes
3773
- # c_op(s) true yes
3774
- # py_op(s) / PyFunc false yes
3775
- # py_op(s) / PyFunc true no
3776
- # c_op(s) + py_op(s) / PyFunc false yes
3777
- # c_op(s) + py_op(s) / PyFunc true no
3778
- run_in_thread = not self.python_multiprocessing or (count_pyfunc == 0 and count_py_ops == 0) or get_debug_mode()
3779
-
3780
- if self.python_multiprocessing and platform.system().lower() == 'windows':
3781
- run_in_thread = True
3782
-
3783
3792
  if count_new_transforms + count_pyfunc == len(operations):
3784
3793
  prev_op = None
3785
3794
  for op in operations:
@@ -3797,43 +3806,18 @@ class MapDataset(UnionBaseDataset):
3797
3806
  op.implementation = Implementation.C
3798
3807
  prev_op = op
3799
3808
  operations = self.__insert_debug_wrapper(operations)
3800
- if run_in_thread:
3801
- operations = transforms.transforms.Compose.reduce(operations)
3809
+ operations = transforms.transforms.Compose.reduce(operations)
3802
3810
  elif count_old_transforms + count_pyfunc + count_non_data_vision_transforms == len(operations):
3803
3811
  operations = self.__insert_debug_wrapper(operations)
3804
- if run_in_thread:
3805
- operations = transforms.py_transforms.Compose.reduce(operations)
3812
+ operations = transforms.py_transforms.Compose.reduce(operations)
3806
3813
  else:
3807
3814
  raise RuntimeError("Mixing old legacy c/py_transforms and new unified transforms is not allowed.")
3808
3815
 
3809
- if run_in_thread:
3810
- self.operations = self.__process_final_operations(operations)
3811
- else:
3812
- self.operations = operations
3816
+ self.operations = self.__process_final_operations(operations)
3813
3817
  self.prepare_multiprocessing()
3814
3818
 
3815
3819
  callbacks = [cb.create_runtime_obj() for cb in self.callbacks]
3816
-
3817
- ## thread mode
3818
- if run_in_thread:
3819
- return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
3820
- callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
3821
-
3822
- # Bind self.operations with self.process_pool
3823
- class _BindProcessPoolWithOperations:
3824
- def __init__(self, pool, operations):
3825
- self.pool = pool
3826
- self.operations = operations
3827
-
3828
- def __call__(self):
3829
- pass
3830
-
3831
- self.bound = _BindProcessPoolWithOperations(self.process_pool, self.operations)
3832
-
3833
- ## process mode
3834
- # in multi process mode, we just transfer the self.bound which is not really used in c layer
3835
- # because when the pipeline is running, map thread transfer data through c++ shm & msg to Python Worker Process
3836
- return cde.MapNode(children[0], [self.bound], self.input_columns, self.output_columns,
3820
+ return cde.MapNode(children[0], self.operations, self.input_columns, self.output_columns,
3837
3821
  callbacks, OffloadToManualOffloadMode.get(self.offload), self.process_pool)
3838
3822
 
3839
3823
  def __deepcopy__(self, memodict):
@@ -3886,22 +3870,10 @@ class MapDataset(UnionBaseDataset):
3886
3870
  @staticmethod
3887
3871
  def __count_pyfuncs(operations):
3888
3872
  """
3889
- Count the number of pyfuncs operations which is defined by user
3873
+ Count the number of pyfuncs operations
3890
3874
  """
3891
3875
  return sum([1 if isinstance(op, FuncWrapper) else 0 for op in operations])
3892
3876
 
3893
- @staticmethod
3894
- def __count_py_ops(operations):
3895
- """
3896
- Count the number of python operations which is built-in
3897
- """
3898
- count = 0
3899
- for op in operations:
3900
- if hasattr(op, "implementation") and op.implementation != Implementation.C \
3901
- and op.implementation is not None:
3902
- count += 1
3903
- return count
3904
-
3905
3877
  @staticmethod
3906
3878
  def __count_transforms(operations):
3907
3879
  """
@@ -3965,6 +3937,7 @@ class MapDataset(UnionBaseDataset):
3965
3937
  " Ignoring Python multiprocessing for map operation.")
3966
3938
  return
3967
3939
  if self.python_multiprocessing:
3940
+ iter_specific_operations = []
3968
3941
  callable_list = []
3969
3942
 
3970
3943
  # If user didn't specify num_parallel_workers, set it to default
@@ -3981,6 +3954,18 @@ class MapDataset(UnionBaseDataset):
3981
3954
  self.process_pool = _PythonMultiprocessing(get_multiprocessing_start_method(),
3982
3955
  self.num_parallel_workers, str(self),
3983
3956
  callable_list, self.max_rowsize)
3957
+ # Pass #2
3958
+ idx = 0
3959
+ for op in self.operations:
3960
+ # our c transforms is now callable and should not be run in Python multithreading
3961
+ if MapDataset.__operation_valid_for_multiprocessing(op):
3962
+ # Wrap Python callable into _PythonCallable
3963
+ iter_specific_operations.append(_PythonCallable(op, idx, self.process_pool))
3964
+ idx += 1
3965
+ else:
3966
+ # CPP ops remain the same
3967
+ iter_specific_operations.append(op)
3968
+ self.operations = iter_specific_operations
3984
3969
 
3985
3970
  def __insert_debug_wrapper(self, operations):
3986
3971
  """