python-wml 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of python-wml might be problematic. Click here for more details.

Files changed (164) hide show
  1. python_wml-3.0.0.dist-info/LICENSE +23 -0
  2. python_wml-3.0.0.dist-info/METADATA +51 -0
  3. python_wml-3.0.0.dist-info/RECORD +164 -0
  4. python_wml-3.0.0.dist-info/WHEEL +5 -0
  5. python_wml-3.0.0.dist-info/top_level.txt +1 -0
  6. wml/__init__.py +0 -0
  7. wml/basic_data_def/__init__.py +2 -0
  8. wml/basic_data_def/detection_data_def.py +279 -0
  9. wml/basic_data_def/io_data_def.py +2 -0
  10. wml/basic_img_utils.py +816 -0
  11. wml/img_patch.py +92 -0
  12. wml/img_utils.py +571 -0
  13. wml/iotoolkit/__init__.py +17 -0
  14. wml/iotoolkit/aic_keypoint.py +115 -0
  15. wml/iotoolkit/baidu_mask_toolkit.py +244 -0
  16. wml/iotoolkit/base_dataset.py +210 -0
  17. wml/iotoolkit/bboxes_statistics.py +515 -0
  18. wml/iotoolkit/build.py +0 -0
  19. wml/iotoolkit/cityscapes_toolkit.py +183 -0
  20. wml/iotoolkit/classification_data_statistics.py +25 -0
  21. wml/iotoolkit/coco_data_fwd.py +225 -0
  22. wml/iotoolkit/coco_keypoints.py +118 -0
  23. wml/iotoolkit/coco_keypoints_fmt2.py +103 -0
  24. wml/iotoolkit/coco_toolkit.py +397 -0
  25. wml/iotoolkit/coco_wholebody.py +269 -0
  26. wml/iotoolkit/common.py +108 -0
  27. wml/iotoolkit/crowd_pose.py +146 -0
  28. wml/iotoolkit/fast_labelme.py +110 -0
  29. wml/iotoolkit/image_folder.py +95 -0
  30. wml/iotoolkit/imgs_cache.py +58 -0
  31. wml/iotoolkit/imgs_reader_mt.py +73 -0
  32. wml/iotoolkit/labelme_base.py +102 -0
  33. wml/iotoolkit/labelme_json_to_img.py +49 -0
  34. wml/iotoolkit/labelme_toolkit.py +117 -0
  35. wml/iotoolkit/labelme_toolkit_fwd.py +733 -0
  36. wml/iotoolkit/labelmemckeypoints_dataset.py +169 -0
  37. wml/iotoolkit/lspet.py +48 -0
  38. wml/iotoolkit/mapillary_vistas_toolkit.py +269 -0
  39. wml/iotoolkit/mat_data.py +90 -0
  40. wml/iotoolkit/mckeypoints_statistics.py +28 -0
  41. wml/iotoolkit/mot_datasets.py +62 -0
  42. wml/iotoolkit/mpii.py +108 -0
  43. wml/iotoolkit/npmckeypoints_dataset.py +164 -0
  44. wml/iotoolkit/o365_to_coco.py +136 -0
  45. wml/iotoolkit/object365_toolkit.py +156 -0
  46. wml/iotoolkit/object365v2_toolkit.py +71 -0
  47. wml/iotoolkit/pascal_voc_data.py +51 -0
  48. wml/iotoolkit/pascal_voc_toolkit.py +194 -0
  49. wml/iotoolkit/pascal_voc_toolkit_fwd.py +473 -0
  50. wml/iotoolkit/penn_action.py +57 -0
  51. wml/iotoolkit/rawframe_dataset.py +129 -0
  52. wml/iotoolkit/rewrite_pascal_voc.py +28 -0
  53. wml/iotoolkit/semantic_data.py +49 -0
  54. wml/iotoolkit/split_file_by_type.py +29 -0
  55. wml/iotoolkit/sports_mot_datasets.py +78 -0
  56. wml/iotoolkit/vis_objectdetection_dataset.py +70 -0
  57. wml/iotoolkit/vis_torch_data.py +39 -0
  58. wml/iotoolkit/yolo_toolkit.py +38 -0
  59. wml/object_detection2/__init__.py +4 -0
  60. wml/object_detection2/basic_visualization.py +37 -0
  61. wml/object_detection2/bboxes.py +812 -0
  62. wml/object_detection2/data_process_toolkit.py +146 -0
  63. wml/object_detection2/keypoints.py +292 -0
  64. wml/object_detection2/mask.py +120 -0
  65. wml/object_detection2/metrics/__init__.py +3 -0
  66. wml/object_detection2/metrics/build.py +15 -0
  67. wml/object_detection2/metrics/classifier_toolkit.py +440 -0
  68. wml/object_detection2/metrics/common.py +71 -0
  69. wml/object_detection2/metrics/mckps_toolkit.py +338 -0
  70. wml/object_detection2/metrics/toolkit.py +1953 -0
  71. wml/object_detection2/npod_toolkit.py +361 -0
  72. wml/object_detection2/odtools.py +243 -0
  73. wml/object_detection2/standard_names.py +75 -0
  74. wml/object_detection2/visualization.py +956 -0
  75. wml/object_detection2/wmath.py +34 -0
  76. wml/semantic/__init__.py +0 -0
  77. wml/semantic/basic_toolkit.py +65 -0
  78. wml/semantic/mask_utils.py +156 -0
  79. wml/semantic/semantic_test.py +21 -0
  80. wml/semantic/structures.py +1 -0
  81. wml/semantic/toolkit.py +105 -0
  82. wml/semantic/visualization_utils.py +658 -0
  83. wml/threadtoolkit.py +50 -0
  84. wml/walgorithm.py +228 -0
  85. wml/wcollections.py +212 -0
  86. wml/wfilesystem.py +487 -0
  87. wml/wml_utils.py +657 -0
  88. wml/wstructures/__init__.py +4 -0
  89. wml/wstructures/common.py +9 -0
  90. wml/wstructures/keypoints_train_toolkit.py +149 -0
  91. wml/wstructures/kps_structures.py +579 -0
  92. wml/wstructures/mask_structures.py +1161 -0
  93. wml/wtorch/__init__.py +8 -0
  94. wml/wtorch/bboxes.py +104 -0
  95. wml/wtorch/classes_suppression.py +24 -0
  96. wml/wtorch/conv_module.py +181 -0
  97. wml/wtorch/conv_ws.py +144 -0
  98. wml/wtorch/data/__init__.py +16 -0
  99. wml/wtorch/data/_utils/__init__.py +45 -0
  100. wml/wtorch/data/_utils/collate.py +183 -0
  101. wml/wtorch/data/_utils/fetch.py +47 -0
  102. wml/wtorch/data/_utils/pin_memory.py +121 -0
  103. wml/wtorch/data/_utils/signal_handling.py +72 -0
  104. wml/wtorch/data/_utils/worker.py +227 -0
  105. wml/wtorch/data/base_data_loader_iter.py +93 -0
  106. wml/wtorch/data/dataloader.py +501 -0
  107. wml/wtorch/data/datapipes/__init__.py +1 -0
  108. wml/wtorch/data/datapipes/iter/__init__.py +12 -0
  109. wml/wtorch/data/datapipes/iter/batch.py +126 -0
  110. wml/wtorch/data/datapipes/iter/callable.py +92 -0
  111. wml/wtorch/data/datapipes/iter/listdirfiles.py +37 -0
  112. wml/wtorch/data/datapipes/iter/loadfilesfromdisk.py +30 -0
  113. wml/wtorch/data/datapipes/iter/readfilesfromtar.py +60 -0
  114. wml/wtorch/data/datapipes/iter/readfilesfromzip.py +63 -0
  115. wml/wtorch/data/datapipes/iter/sampler.py +94 -0
  116. wml/wtorch/data/datapipes/utils/__init__.py +0 -0
  117. wml/wtorch/data/datapipes/utils/common.py +65 -0
  118. wml/wtorch/data/dataset.py +354 -0
  119. wml/wtorch/data/datasets/__init__.py +4 -0
  120. wml/wtorch/data/datasets/common.py +53 -0
  121. wml/wtorch/data/datasets/listdirfilesdataset.py +36 -0
  122. wml/wtorch/data/datasets/loadfilesfromdiskdataset.py +30 -0
  123. wml/wtorch/data/distributed.py +135 -0
  124. wml/wtorch/data/multi_processing_data_loader_iter.py +866 -0
  125. wml/wtorch/data/sampler.py +267 -0
  126. wml/wtorch/data/single_process_data_loader_iter.py +24 -0
  127. wml/wtorch/data/test_data_loader.py +26 -0
  128. wml/wtorch/dataset_toolkit.py +67 -0
  129. wml/wtorch/depthwise_separable_conv_module.py +98 -0
  130. wml/wtorch/dist.py +591 -0
  131. wml/wtorch/dropblock/__init__.py +6 -0
  132. wml/wtorch/dropblock/dropblock.py +228 -0
  133. wml/wtorch/dropblock/dropout.py +40 -0
  134. wml/wtorch/dropblock/scheduler.py +48 -0
  135. wml/wtorch/ema.py +61 -0
  136. wml/wtorch/fc_module.py +73 -0
  137. wml/wtorch/functional.py +34 -0
  138. wml/wtorch/iter_dataset.py +26 -0
  139. wml/wtorch/loss.py +69 -0
  140. wml/wtorch/nets/__init__.py +0 -0
  141. wml/wtorch/nets/ckpt_toolkit.py +219 -0
  142. wml/wtorch/nets/fpn.py +276 -0
  143. wml/wtorch/nets/hrnet/__init__.py +0 -0
  144. wml/wtorch/nets/hrnet/config.py +2 -0
  145. wml/wtorch/nets/hrnet/hrnet.py +494 -0
  146. wml/wtorch/nets/misc.py +249 -0
  147. wml/wtorch/nets/resnet/__init__.py +0 -0
  148. wml/wtorch/nets/resnet/layers/__init__.py +17 -0
  149. wml/wtorch/nets/resnet/layers/aspp.py +144 -0
  150. wml/wtorch/nets/resnet/layers/batch_norm.py +231 -0
  151. wml/wtorch/nets/resnet/layers/blocks.py +111 -0
  152. wml/wtorch/nets/resnet/layers/wrappers.py +110 -0
  153. wml/wtorch/nets/resnet/r50_config.py +38 -0
  154. wml/wtorch/nets/resnet/resnet.py +691 -0
  155. wml/wtorch/nets/shape_spec.py +20 -0
  156. wml/wtorch/nets/simple_fpn.py +101 -0
  157. wml/wtorch/nms.py +109 -0
  158. wml/wtorch/nn.py +896 -0
  159. wml/wtorch/ocr_block.py +193 -0
  160. wml/wtorch/summary.py +331 -0
  161. wml/wtorch/train_toolkit.py +603 -0
  162. wml/wtorch/transformer_blocks.py +266 -0
  163. wml/wtorch/utils.py +719 -0
  164. wml/wtorch/wlr_scheduler.py +100 -0
@@ -0,0 +1,866 @@
1
import os
import threading
import itertools
import warnings
from typing import Any, Callable, Iterable, TypeVar, Generic, Sequence, List, Optional
import multiprocessing as python_multiprocessing
import torch
import torch.multiprocessing as multiprocessing
from torch._utils import ExceptionWrapper


def _torch_version_tuple(ver):
    """Return the leading numeric components of a version string as a tuple.

    Handles suffixes such as "1.13.1+cu117" or "2.0.0a0" by stopping at the
    first non-numeric component.
    """
    parts = []
    for p in ver.split("+")[0].split("."):
        if not p.isdigit():
            break
        parts.append(int(p))
    return tuple(parts)


# NOTE: compare versions numerically — a plain string compare is wrong
# ("1.10.0" < "1.9.0" lexicographically, which would wrongly take the
# torch._six branch on torch >= 1.10 and crash with ImportError).
if _torch_version_tuple(torch.__version__) < (1, 9):
    from torch._six import queue, container_abcs
else:
    import queue
    # Use collections.abc (not collections): the ABCs (Mapping, Iterable, ...)
    # were removed from the collections top level in Python 3.10, and
    # torch._six.container_abcs was an alias of collections.abc anyway.
    import collections.abc as container_abcs
import wml.wtorch.utils as wtu
import time
from . import IterableDataset, Sampler, SequentialSampler, RandomSampler, BatchSampler, Dataset
from . import _utils, _BaseDataLoaderIter
import wml.wml_utils as wmlu
import traceback
import sys
import math
23
+
24
+ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
25
+ r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
26
+
27
+ # NOTE [ Data Loader Multiprocessing Shutdown Logic ]
28
+ #
29
+ # Preliminary:
30
+ #
31
+ # Our data model looks like this (queues are indicated with curly brackets):
32
+ #
33
+ # main process ||
34
+ # | ||
35
+ # {index_queue} ||
36
+ # | ||
37
+ # worker processes || DATA
38
+ # | ||
39
+ # {worker_result_queue} || FLOW
40
+ # | ||
41
+ # pin_memory_thread of main process || DIRECTION
42
+ # | ||
43
+ # {data_queue} ||
44
+ # | ||
45
+ # data output \/
46
+ #
47
+ # P.S. `worker_result_queue` and `pin_memory_thread` part may be omitted if
48
+ # `pin_memory=False`.
49
+ #
50
+ #
51
+ # Terminating multiprocessing logic requires very careful design. In
52
+ # particular, we need to make sure that
53
+ #
54
+ # 1. The iterator gracefully exits the workers when its last reference is
55
+ # gone or it is depleted.
56
+ #
57
+ # In this case, the workers should be gracefully exited because the
58
+ # main process may still need to continue to run, and we want cleaning
59
+ # up code in the workers to be executed (e.g., releasing GPU memory).
60
+ # Naturally, we implement the shutdown logic in `__del__` of
61
+ # DataLoaderIterator.
62
+ #
63
+ # We delay the discussion on the logic in this case until later.
64
+ #
65
+ # 2. The iterator exits the workers when the loader process and/or worker
66
+ # processes exits normally or with error.
67
+ #
68
+ # We set all workers and `pin_memory_thread` to have `daemon=True`.
69
+ #
70
+ # You may ask, why can't we make the workers non-daemonic, and
71
+ # gracefully exit using the same logic as we have in `__del__` when the
72
+ # iterator gets deleted (see 1 above)?
73
+ #
74
+ # First of all, `__del__` is **not** guaranteed to be called when
75
+ # interpreter exits. Even if it is called, by the time it executes,
76
+ #      many Python core library resources may already be freed, and even
77
+ # simple things like acquiring an internal lock of a queue may hang.
78
+ # Therefore, in this case, we actually need to prevent `__del__` from
79
+ # being executed, and rely on the automatic termination of daemonic
80
+ # children.
81
+ #
82
+ # Thus, we register an `atexit` hook that sets a global flag
83
+ # `_utils.python_exit_status`. Since `atexit` hooks are executed in the
84
+ # reverse order of registration, we are guaranteed that this flag is
85
+ # set before library resources we use are freed (which, at least in
86
+ # CPython, is done via an `atexit` handler defined in
87
+ # `multiprocessing/util.py`
88
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/util.py#L320-L362
89
+ # registered when an object requiring this mechanism is first
90
+ # created, e.g., `mp.Queue`
91
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/context.py#L100-L103
92
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/queues.py#L29
93
+ # )
94
+ #
95
+ # So in `__del__`, we check if `_utils.python_exit_status` is set or
96
+ # `None` (freed), and perform no-op if so.
97
+ #
98
+ # However, simply letting library clean-up codes run can also be bad,
99
+ # because such codes (i.e., `multiprocessing.util._exit_function()`)
100
+ # include join putting threads for `mp.Queue`, which can be blocking.
101
+ # Hence, the main process putting threads are called with
102
+ # `cancel_join_thread` at creation. See later section
103
+ # [ 3b. A process won't hang when putting into a queue; ]
104
+ # for more details.
105
+ #
106
+ # Here are two example cases where library clean-up codes can run
107
+ # before `__del__` is called:
108
+ #
109
+ # 1. If we hold onto a reference to the iterator, it more often
110
+ # than not tries to do `multiprocessing` library cleaning before
111
+ # clearing the alive referenced objects (https://github.com/pytorch/pytorch/issues/48666)
112
+ # and thus prevents our cleaning-up code to run first.
113
+ #
114
+ #      2. A similar issue arises when a `DataLoader` is used in a subprocess.
115
+ # When a process ends, it shuts the all its daemonic children
116
+ # down with a SIGTERM (instead of joining them without a timeout).
117
+ #         Similarly for threads, but by a different mechanism. This fact,
118
+ # together with a few implementation details of multiprocessing, forces
119
+ # us to make workers daemonic. All of our problems arise when a
120
+ # DataLoader is used in a subprocess, and are caused by multiprocessing
121
+ # code which looks more or less like this:
122
+ #
123
+ # try:
124
+ # your_function_using_a_dataloader()
125
+ # finally:
126
+ # multiprocessing.util._exit_function()
127
+ #
128
+ # The joining/termination mentioned above happens inside
129
+ # `_exit_function()`. Now, if `your_function_using_a_dataloader()`
130
+ # throws, the stack trace stored in the exception will prevent the
131
+ # frame which uses `DataLoaderIter` to be freed. If the frame has any
132
+ # reference to the `DataLoaderIter` (e.g., in a method of the iter),
133
+ # its `__del__`, which starts the shutdown procedure, will not be
134
+ # called. That, in turn, means that workers aren't notified. Attempting
135
+ # to join in `_exit_function` will then result in a hang.
136
+ #
137
+ # For context, `_exit_function` is also registered as an `atexit` call.
138
+ # So it is unclear to me (@ssnl) why this is needed in a finally block.
139
+ # The code dates back to 2008 and there is no comment on the original
140
+ # PEP 371 or patch https://bugs.python.org/issue3050 (containing both
141
+ # the finally block and the `atexit` registration) that explains this.
142
+ #
143
+ #
144
+ # Finally, another choice is to just shutdown workers with logic in 1
145
+ # above whenever we see an error in `next`. This isn't ideal because
146
+ # a. It prevents users from using try-catch to resume data loading.
147
+ # b. It doesn't prevent hanging if users have references to the
148
+ # iterator.
149
+ #
150
+ # 3. All processes exit if any of them die unexpectedly by fatal signals.
151
+ #
152
+ # As shown above, the workers are set as daemonic children of the main
153
+ # process. However, automatic cleaning-up of such child processes only
154
+ # happens if the parent process exits gracefully (e.g., not via fatal
155
+ # signals like SIGKILL). So we must ensure that each process will exit
156
+ # even the process that should send/receive data to/from it were
157
+ # killed, i.e.,
158
+ #
159
+ # a. A process won't hang when getting from a queue.
160
+ #
161
+ # Even with carefully designed data dependencies (i.e., a `put()`
162
+ # always corresponding to a `get()`), hanging on `get()` can still
163
+ # happen when data in queue is corrupted (e.g., due to
164
+ # `cancel_join_thread` or unexpected exit).
165
+ #
166
+ # For child exit, we set a timeout whenever we try to get data
167
+ # from `data_queue`, and check the workers' status on each timeout
168
+ # and error.
169
+ # See `_DataLoaderiter._get_batch()` and
170
+ # `_DataLoaderiter._try_get_data()` for details.
171
+ #
172
+ # Additionally, for child exit on non-Windows platforms, we also
173
+ # register a SIGCHLD handler (which is supported on Windows) on
174
+ # the main process, which checks if any of the workers fail in the
175
+ # (Python) handler. This is more efficient and faster in detecting
176
+ # worker failures, compared to only using the above mechanism.
177
+ # See `DataLoader.cpp` and `_utils/signal_handling.py` for details.
178
+ #
179
+ # For `.get()` calls where the sender(s) is not the workers, we
180
+ # guard them with timeouts, and check the status of the sender
181
+ # when timeout happens:
182
+ # + in the workers, the `_utils.worker.ManagerWatchdog` class
183
+ # checks the status of the main process.
184
+ # + if `pin_memory=True`, when getting from `pin_memory_thread`,
185
+ # check `pin_memory_thread` status periodically until `.get()`
186
+ # returns or see that `pin_memory_thread` died.
187
+ #
188
+ # b. A process won't hang when putting into a queue;
189
+ #
190
+ # We use `mp.Queue` which has a separate background thread to put
191
+ # objects from an unbounded buffer array. The background thread is
192
+ # daemonic and usually automatically joined when the process
193
+ # *exits*.
194
+ #
195
+ # In case that the receiver has ended abruptly while
196
+ # reading from the pipe, the join will hang forever. The usual
197
+ # solution for this in Python is calling `q.cancel_join_thread`,
198
+ # which prevents automatically joining it when finalizing
199
+ # (exiting).
200
+ #
201
+ # Nonetheless, `cancel_join_thread` must only be called when the
202
+ # queue is **not** going to be read from or write into by another
203
+ # process, because it may hold onto a lock or leave corrupted data
204
+ # in the queue, leading other readers/writers to hang.
205
+ #
206
+ # Hence,
207
+ # + For worker processes, we only do so (for their output
208
+ # queues, i.e., `worker_result_queue`) before exiting.
209
+ # + For `pin_memory_thread`, its output queue `data_queue` is a
210
+ # `queue.Queue` that does blocking `put` if the queue is full.
211
+ # So there is no above problem, but as a result, in
212
+ # `_pin_memory_loop`, we do need to wrap the `put` in a loop
213
+ # that breaks not only upon success, but also when the main
214
+ # process stops reading, i.e., is shutting down.
215
+ # + For loader process, we `cancel_join_thread()` for all
216
+ # `_index_queues` because the whole purpose of workers and
217
+ # `pin_memory_thread` is to serve the loader process. If
218
+ # loader process is already exiting, we don't really care if
219
+ # the queues are corrupted.
220
+ #
221
+ #
222
+ # Now let's get back to 1:
223
+ # how we gracefully exit the workers when the last reference to the
224
+ # iterator is gone.
225
+ #
226
+ # To achieve this, we implement the following logic along with the design
227
+ # choices mentioned above:
228
+ #
229
+ # `workers_done_event`:
230
+ # A `multiprocessing.Event` shared among the main process and all worker
231
+ # processes. This is used to signal the workers that the iterator is
232
+ # shutting down. After it is set, they will not send processed data to
233
+ # queues anymore, and only wait for the final `None` before exiting.
234
+ # `done_event` isn't strictly needed. I.e., we can just check for `None`
235
+ # from the input queue, but it allows us to skip wasting resources
236
+ # processing data if we are already shutting down.
237
+ #
238
+ # `pin_memory_thread_done_event`:
239
+ # A `threading.Event` for a similar purpose to that of
240
+ # `workers_done_event`, but is for the `pin_memory_thread`. The reason
241
+ # that separate events are needed is that `pin_memory_thread` reads from
242
+ # the output queue of the workers. But the workers, upon seeing that
243
+ # `workers_done_event` is set, only wants to see the final `None`, and is
244
+ # not required to flush all data in the output queue (e.g., it may call
245
+ # `cancel_join_thread` on that queue if its `IterableDataset` iterator
246
+ # happens to exhaust coincidentally, which is out of the control of the
247
+ # main process). Thus, since we will exit `pin_memory_thread` before the
248
+ #      workers (see below), two separate events are used.
249
+ #
250
+ # NOTE: In short, the protocol is that the main process will set these
251
+ # `done_event`s and then the corresponding processes/threads a `None`,
252
+ # and that they may exit at any time after receiving the `None`.
253
+ #
254
+ # NOTE: Using `None` as the final signal is valid, since normal data will
255
+ # always be a 2-tuple with the 1st element being the index of the data
256
+ # transferred (different from dataset index/key), and the 2nd being
257
+ # either the dataset key or the data sample (depending on which part
258
+ # of the data model the queue is at).
259
+ #
260
+ # [ worker processes ]
261
+ # While loader process is alive:
262
+ # Get from `index_queue`.
263
+ # If get anything else,
264
+ # Check `workers_done_event`.
265
+ # If set, continue to next iteration
266
+ # i.e., keep getting until see the `None`, then exit.
267
+ # Otherwise, process data:
268
+ # If is fetching from an `IterableDataset` and the iterator
269
+ # is exhausted, send an `_IterableDatasetStopIteration`
270
+ # object to signal iteration end. The main process, upon
271
+ # receiving such an object, will send `None` to this
272
+ # worker and not use the corresponding `index_queue`
273
+ # anymore.
274
+ # If timed out,
275
+ # No matter `workers_done_event` is set (still need to see `None`)
276
+ # or not, must continue to next iteration.
277
+ # (outside loop)
278
+ # If `workers_done_event` is set, (this can be False with `IterableDataset`)
279
+ # `data_queue.cancel_join_thread()`. (Everything is ending here:
280
+ # main process won't read from it;
281
+ # other workers will also call
282
+ # `cancel_join_thread`.)
283
+ #
284
+ # [ pin_memory_thread ]
285
+ # # No need to check main thread. If this thread is alive, the main loader
286
+ # # thread must be alive, because this thread is set as daemonic.
287
+ # While `pin_memory_thread_done_event` is not set:
288
+ # Get from `index_queue`.
289
+ # If timed out, continue to get in the next iteration.
290
+ # Otherwise, process data.
291
+ # While `pin_memory_thread_done_event` is not set:
292
+ # Put processed data to `data_queue` (a `queue.Queue` with blocking put)
293
+ # If timed out, continue to put in the next iteration.
294
+ # Otherwise, break, i.e., continuing to the out loop.
295
+ #
296
+ # NOTE: we don't check the status of the main thread because
297
+ # 1. if the process is killed by fatal signal, `pin_memory_thread`
298
+ # ends.
299
+ # 2. in other cases, either the cleaning-up in __del__ or the
300
+ # automatic exit of daemonic thread will take care of it.
301
+ # This won't busy-wait either because `.get(timeout)` does not
302
+ # busy-wait.
303
+ #
304
+ # [ main process ]
305
+ # In the DataLoader Iter's `__del__`
306
+ # b. Exit `pin_memory_thread`
307
+ # i. Set `pin_memory_thread_done_event`.
308
+ # ii Put `None` in `worker_result_queue`.
309
+ # iii. Join the `pin_memory_thread`.
310
+ # iv. `worker_result_queue.cancel_join_thread()`.
311
+ #
312
+ # c. Exit the workers.
313
+ # i. Set `workers_done_event`.
314
+ # ii. Put `None` in each worker's `index_queue`.
315
+ # iii. Join the workers.
316
+ # iv. Call `.cancel_join_thread()` on each worker's `index_queue`.
317
+ #
318
+ # NOTE: (c) is better placed after (b) because it may leave corrupted
319
+ # data in `worker_result_queue`, which `pin_memory_thread`
320
+ # reads from, in which case the `pin_memory_thread` can only
321
+ #        happen at timing out, which is slow. Nonetheless, same thing
322
+ # happens if a worker is killed by signal at unfortunate times,
323
+ # but in other cases, we are better off having a non-corrupted
324
+ # `worker_result_queue` for `pin_memory_thread`.
325
+ #
326
+ # NOTE: If `pin_memory=False`, there is no `pin_memory_thread` and (b)
327
+ # can be omitted
328
+ #
329
+ # NB: `done_event`s isn't strictly needed. E.g., we can just check for
330
+ # `None` from `index_queue`, but it allows us to skip wasting resources
331
+ # processing indices already in `index_queue` if we are already shutting
332
+ # down.
333
+
334
    def __init__(self, loader,batch_split_nr=1):
        """Build the worker processes, queues and (optionally) the pin-memory thread.

        Args:
            loader: the owning DataLoader; supplies dataset, collate_fn,
                multiprocessing context, pin_memory flag, num_workers, etc.
            batch_split_nr (int): number of worker batches that are later
                concatenated into a single output batch (see ``_try_get_data``).
                Values below 1 are clamped to 1, which disables splitting.
        """
        super(_MultiProcessingDataLoaderIter, self).__init__(loader)
        # NOTE(review): message says "_WMultiProcessingDataLoaderIter" while the
        # class is named "_MultiProcessingDataLoaderIter" — looks like a leftover
        # from a rename; confirm before relying on the log text.
        print(f"Use _WMultiProcessingDataLoaderIter, batch split nr = {batch_split_nr}, pin memory= {loader.pin_memory}, num_works= {self._num_workers}")

        assert self._num_workers > 0
        assert self._prefetch_factor > 0

        if loader.multiprocessing_context is None:
            multiprocessing_context = multiprocessing
        else:
            multiprocessing_context = loader.multiprocessing_context

        self._worker_init_fn = loader.worker_init_fn
        self._worker_queue_idx_cycle = itertools.cycle(range(self._num_workers))
        # No certainty which module multiprocessing_context is.
        # Queue capacity scales with prefetch_factor and batch_split_nr so that
        # all fragments of a split batch can be buffered at once.
        self._worker_result_queue = multiprocessing_context.Queue(max(self._prefetch_factor*batch_split_nr*2,min(self._num_workers,6)*self._prefetch_factor*batch_split_nr)) # type: ignore
        self._worker_pids_set = False
        self._shutdown = False
        self._workers_done_event = multiprocessing_context.Event()
        # Set elsewhere when a worker signals end of an iterable-style dataset;
        # consumed by `_try_get_data` to decide when to raise StopIteration.
        self.stop_iteration = False

        self._index_queues = []
        self._workers = []
        # Clamp to at least 1: batch_split_nr == 1 means "no splitting".
        self.batch_split_nr = max(1,batch_split_nr)
        for i in range(self._num_workers):
            # No certainty which module multiprocessing_context is
            index_queue = multiprocessing_context.Queue() # type: ignore
            # Need to `cancel_join_thread` here!
            # See sections (2) and (3b) above.
            index_queue.cancel_join_thread()
            w = multiprocessing_context.Process(
                target=_utils.worker._worker_loop,
                args=(self._dataset_kind, self._dataset, index_queue,
                      self._worker_result_queue, self._workers_done_event,
                      self._auto_collation, self._collate_fn, self._drop_last,
                      self._base_seed + i, self._worker_init_fn, i, self._num_workers,
                      self._persistent_workers))
            w.daemon = True
            # NB: Process.start() actually take some time as it needs to
            #     start a process and pass the arguments over via a pipe.
            #     Therefore, we only add a worker to self._workers list after
            #     it started, so that we do not call .join() if program dies
            #     before it starts, and __del__ tries to join but will get:
            #     AssertionError: can only join a started process.
            w.start()
            self._index_queues.append(index_queue)
            self._workers.append(w)

        if self._pin_memory:
            self._pin_memory_thread_done_event = threading.Event()

            # Queue is not type-annotated
            self._data_queue = queue.Queue(4*self.batch_split_nr) # type: ignore
            # Dedicated CUDA stream on which the pin-memory thread performs its
            # copies; consumers synchronise with it in `_try_get_data_imp_stream`.
            self.pin_memory_stream = torch.cuda.Stream()
            pin_memory_thread = threading.Thread(
                target=_utils.pin_memory._pin_memory_loop_stream,
                args=(self._worker_result_queue, self._data_queue,
                      torch.cuda.current_device(),
                      self._pin_memory_thread_done_event,
                      self.pin_memory_stream))
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            # Similar to workers (see comment above), we only register
            # pin_memory_thread once it is started.
            self._pin_memory_thread = pin_memory_thread
            self._try_get_data_imp = self._try_get_data_imp_stream
        else:
            self._data_queue = self._worker_result_queue
            self._try_get_data_imp = self._try_get_data_imp_no_stream

        # .pid can be None only before process is spawned (not the case, so ignore)
        _utils.signal_handling._set_worker_pids(id(self), tuple(w.pid for w in self._workers)) # type: ignore
        _utils.signal_handling._set_SIGCHLD_handler()
        self._worker_pids_set = True
        self._reset(loader, first_iter=True)
        # Fragments buffered for batch splitting; consumed by `_try_get_data`.
        self.datas_cache = []

        all_pids = f"{os.getpid()}"
        for x in self._workers:
            # Process.ident is an alias of Process.pid for multiprocessing.Process.
            all_pids += f" {x.ident}"
        print(f"All data workers process PID: {all_pids}")
        print(all_pids.replace(" ",","))
417
+
418
+ def _reset(self, loader, first_iter=False):
419
+ super()._reset(loader, first_iter)
420
+ self.stop_iteration = False
421
+ self._send_idx = 0 # idx of the next task to be sent to workers
422
+ self._rcvd_idx = 0 # idx of the next task to be returned in __next__
423
+ # information about data not yet yielded, i.e., tasks w/ indices in range [rcvd_idx, send_idx).
424
+ # map: task idx => - (worker_id,) if data isn't fetched (outstanding)
425
+ # \ (worker_id, data) if data is already fetched (out-of-order)
426
+ self._task_info = {}
427
+ self._tasks_outstanding = 0 # always equal to count(v for v in task_info.values() if len(v) == 1)
428
+ # A list of booleans representing whether each worker still has work to
429
+ # do, i.e., not having exhausted its iterable dataset object. It always
430
+ # contains all `True`s if not using an iterable-style dataset
431
+ # (i.e., if kind != Iterable).
432
+ # Not that this indicates that a worker still has work to do *for this epoch*.
433
+ # It does not mean that a worker is dead. In case of `_persistent_workers`,
434
+ # the worker will be reset to available in the next epoch.
435
+ self._workers_status = [True for i in range(self._num_workers)]
436
+ # prime the prefetch loop
437
+ self._try_put_index_first()
438
+
439
+ def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
440
+ if self.batch_split_nr<=1:
441
+ res,data = self._try_get_data_imp(timeout=timeout)
442
+ if res is False:
443
+ if self.stop_iteration:
444
+ for q in self._index_queues:
445
+ if q.qsize() > 0:
446
+ return False, None
447
+ raise StopIteration()
448
+ else:
449
+ return res,data
450
+ else:
451
+ return res,data
452
+ else:
453
+ try_nr = self.batch_split_nr+2
454
+ while try_nr>0 and len(self.datas_cache)<self.batch_split_nr:
455
+ res,data = self._try_get_data_imp(timeout=max(1,timeout/self.batch_split_nr))
456
+ if res:
457
+ if isinstance(data[1], ExceptionWrapper):
458
+ e = data[1]
459
+ msg = "ERROR: Caught {} {}.\nOriginal {}".format(
460
+ e.exc_type.__name__, e.where, e.exc_msg)
461
+ print(msg)
462
+ elif data[1] is not None:
463
+ self.datas_cache.append(data[1])
464
+ elif self.stop_iteration:
465
+ break
466
+ try_nr -= 1
467
+ if len(self.datas_cache)>=self.batch_split_nr:
468
+ try:
469
+ data = wtu.concat_datas(self.datas_cache[:self.batch_split_nr],dim=0)
470
+ self.datas_cache = self.datas_cache[self.batch_split_nr:]
471
+ except Exception as e:
472
+ print(f"ERROR: Concat datas faild, {e}.")
473
+ self.datas_cache = []
474
+ return False,None
475
+ return True,(0,data)
476
+ if self.stop_iteration:
477
+ for q in self._index_queues:
478
+ if q.qsize()>0:
479
+ return False,None
480
+ raise StopIteration()
481
+ else:
482
+ return False,None
483
+
484
+ @staticmethod
485
+ def record_stream(data,stream):
486
+ if torch.is_tensor(data):
487
+ if data.dtype != torch.int16: #hack: 不处理int16
488
+ data.record_stream(stream)
489
+ elif isinstance(data,container_abcs.Mapping):
490
+ for k,v in data.items():
491
+ _MultiProcessingDataLoaderIter.record_stream(v,stream)
492
+ elif isinstance(data,Iterable):
493
+ for x in data:
494
+ _MultiProcessingDataLoaderIter.record_stream(x,stream)
495
+
496
+ def _try_get_data_imp_stream(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
497
+ torch.cuda.current_stream().wait_stream(self.pin_memory_stream)
498
+ res = self._try_get_data_imp_no_stream(timeout=timeout)
499
+ if res[0]:
500
+ _MultiProcessingDataLoaderIter.record_stream(res[1][1],torch.cuda.current_stream())
501
+ return res
502
+
503
    def _try_get_data_imp_no_stream(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
        # Tries to fetch data from `self._data_queue` once for a given timeout.
        # This can also be used as inner loop of fetching without timeout, with
        # the sender status as the loop condition.
        #
        # This raises a `RuntimeError` if any worker died expectedly. This error
        # can come from either the SIGCHLD handler in `_utils/signal_handling.py`
        # (only for non-Windows platforms), or the manual check below on errors
        # and timeouts.
        #
        # Returns a 2-tuple:
        #   (bool: whether successfully get data, any: data if successful else None)
        try:
            data = self._data_queue.get(timeout=timeout)
            return (True, data)
        except Exception as e:
            # At timeout and error, we manually check whether any worker has
            # failed. Note that this is the only mechanism for Windows to detect
            # worker failures.
            failed_workers = []
            for worker_id, w in enumerate(self._workers):
                # A worker that was marked available but is no longer alive
                # has died unexpectedly.
                if self._workers_status[worker_id] and not w.is_alive():
                    failed_workers.append(w)
                    self._mark_worker_as_unavailable(worker_id)
            if len(failed_workers) > 0:
                pids_str = ', '.join(str(w.pid) for w in failed_workers)
                raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e
            if isinstance(e, queue.Empty):
                # Plain timeout with all workers healthy: report "no data yet".
                return (False, None)
            import tempfile
            import errno
            try:
                # Raise an exception if we are this close to the FDs limit.
                # Apparently, trying to open only one file is not a sufficient
                # test.
                # See NOTE [ DataLoader on Linux and open files limit ]
                fds_limit_margin = 10
                # `fs` is intentionally unused: the NamedTemporaryFile objects
                # hold FDs open until this list goes out of scope.
                fs = [tempfile.NamedTemporaryFile() for i in range(fds_limit_margin)]
            except OSError as e:
                if e.errno == errno.EMFILE:
                    # Diagnostic dump: current FD count and the shell's limit.
                    print(f"Open files nr")
                    cmd = f"ls /proc/{os.getpid()}/fd | wc -l"
                    os.system(cmd)
                    print(f"ulimit -n")
                    cmd = f"ulimit -n"
                    os.system(cmd)
                    raise RuntimeError(
                        "Too many open files. Communication with the"
                        " workers is no longer possible. Please increase the"
                        " limit using `ulimit -n` in the shell or change the"
                        " sharing strategy by calling"
                        " `torch.multiprocessing.set_sharing_strategy('file_system')`"
                        " at the beginning of your code") from None
            # Unknown failure that is neither a timeout nor an FD-limit issue:
            # re-raise the original exception.
            raise
557
+
558
+ # NOTE [ DataLoader on Linux and open files limit ]
559
+ #
560
+ # On Linux when DataLoader is used with multiprocessing we pass the data between
561
+ # the root process and the workers through SHM files. We remove those files from
562
+ # the filesystem as soon as they are created and keep them alive by
563
+ # passing around their file descriptors through AF_UNIX sockets. (See
564
+ # docs/source/multiprocessing.rst and 'Multiprocessing Technical Notes` in
565
+ # the wiki (https://github.com/pytorch/pytorch/wiki).)
566
+ #
567
+ # This sometimes leads us to exceeding the open files limit. When that happens,
568
+ # and the offending file descriptor is coming over a socket, the `socket` Python
569
+ # package silently strips the file descriptor from the message, setting only the
570
+ # `MSG_CTRUNC` flag (which might be a bit misleading since the manpage says that
571
+ # it _indicates that some control data were discarded due to lack of space in
572
+ # the buffer for ancillary data_). This might reflect the C implementation of
573
+ # AF_UNIX sockets.
574
+ #
575
+ # This behaviour can be reproduced with the script and instructions at the
576
+ # bottom of this note.
577
+ #
578
+ # When that happens, the standard Python `multiprocessing` (and not
579
+ # `torch.multiprocessing`) raises a `RuntimeError: received 0 items of ancdata`
580
+ #
581
+ # Sometimes, instead of the FD being stripped, you may get an `OSError:
582
+ # Too many open files`, both in the script below and in DataLoader. However,
583
+ # this is rare and seems to be nondeterministic.
584
+ #
585
+ #
586
+ # #!/usr/bin/env python3
587
+ # import sys
588
+ # import socket
589
+ # import os
590
+ # import array
591
+ # import shutil
592
+ # import socket
593
+ #
594
+ #
595
+ # if len(sys.argv) != 4:
596
+ # print("Usage: ", sys.argv[0], " tmp_dirname iteration (send|recv)")
597
+ # sys.exit(1)
598
+ #
599
+ # if __name__ == '__main__':
600
+ # dirname = sys.argv[1]
601
+ # sock_path = dirname + "/sock"
602
+ # iterations = int(sys.argv[2])
603
+ # def dummy_path(i):
604
+ # return dirname + "/" + str(i) + ".dummy"
605
+ #
606
+ #
607
+ # if sys.argv[3] == 'send':
608
+ # while not os.path.exists(sock_path):
609
+ # pass
610
+ # client = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
611
+ # client.connect(sock_path)
612
+ # for i in range(iterations):
613
+ # fd = os.open(dummy_path(i), os.O_WRONLY | os.O_CREAT)
614
+ # ancdata = array.array('i', [fd])
615
+ # msg = bytes([i % 256])
616
+ # print("Sending fd ", fd, " (iteration #", i, ")")
617
+ # client.sendmsg([msg], [(socket.SOL_SOCKET, socket.SCM_RIGHTS, ancdata)])
618
+ #
619
+ #
620
+ # else:
621
+ # assert sys.argv[3] == 'recv'
622
+ #
623
+ # if os.path.exists(dirname):
624
+ # raise Exception("Directory exists")
625
+ #
626
+ # os.mkdir(dirname)
627
+ #
628
+ # print("Opening socket...")
629
+ # server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
630
+ # server.bind(sock_path)
631
+ #
632
+ # print("Listening...")
633
+ # for i in range(iterations):
634
+ # a = array.array('i')
635
+ # msg, ancdata, flags, addr = server.recvmsg(1, socket.CMSG_SPACE(a.itemsize))
636
+ # assert(len(ancdata) == 1)
637
+ # cmsg_level, cmsg_type, cmsg_data = ancdata[0]
638
+ # a.frombytes(cmsg_data)
639
+ # print("Received fd ", a[0], " (iteration #", i, ")")
640
+ #
641
+ # shutil.rmtree(dirname)
642
+ #
643
+ # Steps to reproduce:
644
+ #
645
+ # 1. Run two shells and set lower file descriptor limit in the receiving one:
646
+ # (shell1) ulimit -n 1020
647
+ # (shell2) ulimit -n 1022
648
+ #
649
+ # 2. Run the script above with the `recv` option in the first shell
650
+ # (shell1) ./test_socket.py sock_tmp 1017 recv
651
+ #
652
+ # 3. Run the script with the `send` option in the second shell:
653
+ # (shell2) ./test_socket.py sock_tmp 1017 send
654
+
655
+ def _get_data(self):
656
+ # Fetches data from `self._data_queue`.
657
+ #
658
+ # We check workers' status every `MP_STATUS_CHECK_INTERVAL` seconds,
659
+ # which we achieve by running `self._try_get_data(timeout=MP_STATUS_CHECK_INTERVAL)`
660
+ # in a loop. This is the only mechanism to detect worker failures for
661
+ # Windows. For other platforms, a SIGCHLD handler is also used for
662
+ # worker failure detection.
663
+ #
664
+ # If `pin_memory=True`, we also need check if `pin_memory_thread` had
665
+ # died at timeouts.
666
+ if self._timeout > 0:
667
+ success, data = self._try_get_data(self._timeout)
668
+ if success:
669
+ return data
670
+ else:
671
+ raise RuntimeError('DataLoader timed out after {} seconds'.format(self._timeout))
672
+ elif self._pin_memory:
673
+ while self._pin_memory_thread.is_alive():
674
+ success, data = self._try_get_data()
675
+ if success:
676
+ return data
677
+ else:
678
+ # while condition is false, i.e., pin_memory_thread died.
679
+ raise RuntimeError('Pin memory thread exited unexpectedly')
680
+ # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
681
+ # need to call `.task_done()` because we don't use `.join()`.
682
+ else:
683
+ while True:
684
+ success, data = self._try_get_data()
685
+ if success:
686
+ return data
687
+
688
+ def _next_data(self):
689
+ while True:
690
+ idx, data = self._get_data()
691
+ return self._process_data(data)
692
+
693
    def _try_put_index(self):
        # Refill each worker's index queue up to `_prefetch_factor` pending
        # entries. A queue is only topped up once it has drained below half of
        # the prefetch depth (`hdnr`), so refills happen in batches rather
        # than one index at a time.
        try:
            dnr = self._prefetch_factor
            hdnr = max(1,dnr//2)
            for i,index_queue in enumerate(self._index_queues):
                # Skip workers already marked unavailable.
                if self._workers_status[i] and index_queue.qsize()<hdnr:
                    nr = dnr-index_queue.qsize()
                    for _ in range(nr):
                        index = self._next_index()
                        if self.batch_split_nr>1:
                            # Split one sampled batch into `batch_split_nr`
                            # sub-batches; the batch size is expected to be
                            # evenly divisible (only logged, not enforced).
                            if len(index)%self.batch_split_nr!=0:
                                print(f"ERROR: batch_split_nr = {self.batch_split_nr}, batch size = {len(index)}")
                            indexs = wmlu.list_to_2dlistv2(index,self.batch_split_nr)
                            for index in indexs:
                                # (0, index): 0 is a placeholder send-index slot.
                                index_queue.put((0,index))
                        else:
                            index_queue.put((0,index))
        except StopIteration:
            # Sampler exhausted; remembered so the fetch path can raise
            # StopIteration once all queued work has drained.
            self.stop_iteration = True
            return
713
+
714
    def _try_put_index_first(self):
        # Initial seeding of the worker index queues: distribute the first
        # `_prefetch_factor` batches about evenly across all workers
        # (ceil(dnr / num_queues) per worker, at least one each).
        try:
            dnr = self._prefetch_factor
            nr = max(1,math.ceil(dnr/len(self._index_queues)))
            for i,index_queue in enumerate(self._index_queues):
                for _ in range(nr):
                    index = self._next_index()
                    if self.batch_split_nr>1:
                        # Split one sampled batch into `batch_split_nr`
                        # sub-batches; the batch size is expected to be
                        # evenly divisible (only logged, not enforced).
                        if len(index)%self.batch_split_nr!=0:
                            print(f"ERROR: batch_split_nr = {self.batch_split_nr}, batch size = {len(index)}")
                        indexs = wmlu.list_to_2dlistv2(index,self.batch_split_nr)
                        for index in indexs:
                            # (0, index): 0 is a placeholder send-index slot.
                            index_queue.put((0,index))
                    else:
                        index_queue.put((0,index))
        except StopIteration:
            # Sampler exhausted before seeding finished; remember it so the
            # fetch path can raise StopIteration later.
            self.stop_iteration = True
            return
732
+
733
+ def _process_data(self, data):
734
+ self._try_put_index()
735
+ if isinstance(data, ExceptionWrapper):
736
+ data.reraise()
737
+ return data
738
+
739
+ def _mark_worker_as_unavailable(self, worker_id, shutdown=False):
740
+ # Mark a worker as having finished its work e.g., due to
741
+ # exhausting an `IterableDataset`. This should be used only when this
742
+ # `_MultiProcessingDataLoaderIter` is going to continue running.
743
+
744
+ assert self._workers_status[worker_id] or (self._persistent_workers and shutdown)
745
+
746
+ # Signal termination to that specific worker.
747
+ q = self._index_queues[worker_id]
748
+ # Indicate that no more data will be put on this queue by the current
749
+ # process.
750
+ q.put(None)
751
+
752
+ # Note that we don't actually join the worker here, nor do we remove the
753
+ # worker's pid from C side struct because (1) joining may be slow, and
754
+ # (2) since we don't join, the worker may still raise error, and we
755
+ # prefer capturing those, rather than ignoring them, even though they
756
+ # are raised after the worker has finished its job.
757
+ # Joinning is deferred to `_shutdown_workers`, which it is called when
758
+ # all workers finish their jobs (e.g., `IterableDataset` replicas) or
759
+ # when this iterator is garbage collected.
760
+
761
+ self._workers_status[worker_id] = False
762
+ print(f"WARNING: make process {worker_id} to unavaiable.")
763
+
764
+ assert self._workers_done_event.is_set() == shutdown
765
+
766
+ def _clear_index_queue(self):
767
+ for index_queue in self._index_queues:
768
+ try:
769
+ while True:
770
+ index_queue.get_nowait()
771
+ except:
772
+ pass
773
+
774
+ def _clear_worker_results_queue(self):
775
+ try:
776
+ while True:
777
+ self._worker_result_queue.get_nowait()
778
+ except Exception as e:
779
+ pass
780
+
781
    def _shutdown_workers(self):
        # Called when shutting down this `_MultiProcessingDataLoaderIter`.
        # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on
        # the logic of this function.
        #
        # The print/traceback below are debug logging: they record who
        # triggered the shutdown (print_exc is a no-op outside an active
        # exception handler).
        print(f"Shutdown data loader workers.")
        traceback.print_exc(file=sys.stdout)
        sys.stdout.flush()
        python_exit_status = _utils.python_exit_status
        if python_exit_status is True or python_exit_status is None:
            # See (2) of the note. If Python is shutting down, do no-op.
            return
        # Normal exit when last reference is gone / iterator is depleted.
        # See (1) and the second half of the note.
        if not self._shutdown:
            self._shutdown = True
            try:
                # Normal exit when last reference is gone / iterator is depleted.
                # See (1) and the second half of the note.

                # Exit `pin_memory_thread` first because exiting workers may leave
                # corrupted data in `worker_result_queue` which `pin_memory_thread`
                # reads from.

                # Exit workers now: drop queued work, signal done, and drain
                # results so workers blocked on a full result queue can exit.
                self._clear_index_queue()
                self._workers_done_event.set()
                self._clear_worker_results_queue()
                try:
                    # Wake up anything still blocked on the result queue.
                    self._worker_result_queue.put_nowait((None, None))
                except:
                    pass

                for worker_id in range(len(self._workers)):
                    # Get number of workers from `len(self._workers)` instead of
                    # `self._num_workers` in case we error before starting all
                    # workers.
                    # If we are using workers_status with persistent_workers
                    # we have to shut it down because the worker is paused
                    if self._persistent_workers or self._workers_status[worker_id]:
                        self._mark_worker_as_unavailable(worker_id, shutdown=True)


                if hasattr(self, '_pin_memory_thread'):
                    # Use hasattr in case error happens before we set the attribute.
                    self._pin_memory_thread_done_event.set()
                    # Send something to pin_memory_thread in case it is waiting
                    # so that it can wake up and check `pin_memory_thread_done_event`
                    self._pin_memory_thread.join()
                    self._worker_result_queue.cancel_join_thread()
                    self._worker_result_queue.close()

                for w in self._workers:
                    # We should be able to join here, but in case anything went
                    # wrong, we set a timeout and if the workers fail to join,
                    # they are killed in the `finally` block.
                    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
                for q in self._index_queues:
                    q.cancel_join_thread()
                    q.close()
            finally:
                # Even though all this function does is putting into queues that
                # we have called `cancel_join_thread` on, weird things can
                # happen when a worker is killed by a signal, e.g., hanging in
                # `Event.set()`. So we need to guard this with SIGCHLD handler,
                # and remove pids from the C side data structure only at the
                # end.
                #
                # FIXME: Unfortunately, for Windows, we are missing a worker
                # error detection mechanism here in this function, as it
                # doesn't provide a SIGCHLD handler.
                if self._worker_pids_set:
                    _utils.signal_handling._remove_worker_pids(id(self))
                    self._worker_pids_set = False
                for w in self._workers:
                    if w.is_alive():
                        # Existing mechanisms try to make the workers exit
                        # peacefully, but in case that we unfortunately reach
                        # here, which we shouldn't, (e.g., pytorch/pytorch#39570),
                        # we kill the worker.
                        w.terminate()
861
+
862
+ def __del__(self):
863
+ try:
864
+ self._shutdown_workers()
865
+ except:
866
+ pass