nnInteractive 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nnInteractive/__init__.py +3 -0
- nnInteractive/inference/__init__.py +0 -0
- nnInteractive/inference/cvpr2025_challenge_baseline/__init__.py +0 -0
- nnInteractive/inference/cvpr2025_challenge_baseline/predict.py +173 -0
- nnInteractive/inference/inference_session.py +1400 -0
- nnInteractive/interaction/__init__.py +0 -0
- nnInteractive/interaction/point.py +166 -0
- nnInteractive/supervoxel/setup.py +4 -0
- nnInteractive/supervoxel/src/metadata.py +118 -0
- nnInteractive/supervoxel/src/reader.py +175 -0
- nnInteractive/supervoxel/src/run.py +136 -0
- nnInteractive/supervoxel/src/sam2/__init__.py +2 -0
- nnInteractive/supervoxel/src/sam2/sam2/__init__.py +11 -0
- nnInteractive/supervoxel/src/sam2/sam2/automatic_mask_generator.py +434 -0
- nnInteractive/supervoxel/src/sam2/sam2/benchmark.py +86 -0
- nnInteractive/supervoxel/src/sam2/sam2/build_sam.py +172 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/hieradet.py +305 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/image_encoder.py +132 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/backbones/utils.py +89 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/memory_attention.py +167 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/memory_encoder.py +179 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/position_encoding.py +217 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/mask_decoder.py +274 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/prompt_encoder.py +194 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam/transformer.py +293 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam2_base.py +879 -0
- nnInteractive/supervoxel/src/sam2/sam2/modeling/sam2_utils.py +315 -0
- nnInteractive/supervoxel/src/sam2/sam2/sam2_image_predictor.py +433 -0
- nnInteractive/supervoxel/src/sam2/sam2/sam2_video_predictor.py +1171 -0
- nnInteractive/supervoxel/src/sam2/sam2/sam2_video_predictor_legacy.py +1125 -0
- nnInteractive/supervoxel/src/sam2/sam2/utils/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/sam2/utils/amg.py +332 -0
- nnInteractive/supervoxel/src/sam2/sam2/utils/misc.py +488 -0
- nnInteractive/supervoxel/src/sam2/sam2/utils/transforms.py +108 -0
- nnInteractive/supervoxel/src/sam2/setup.py +174 -0
- nnInteractive/supervoxel/src/sam2/training/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/sam2_datasets.py +176 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/transforms.py +481 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/utils.py +102 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/vos_dataset.py +154 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/vos_raw_dataset.py +290 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/vos_sampler.py +103 -0
- nnInteractive/supervoxel/src/sam2/training/dataset/vos_segment_loader.py +289 -0
- nnInteractive/supervoxel/src/sam2/training/loss_fns.py +290 -0
- nnInteractive/supervoxel/src/sam2/training/model/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/training/model/sam2.py +515 -0
- nnInteractive/supervoxel/src/sam2/training/optimizer.py +462 -0
- nnInteractive/supervoxel/src/sam2/training/scripts/sav_frame_extraction_submitit.py +157 -0
- nnInteractive/supervoxel/src/sam2/training/train.py +232 -0
- nnInteractive/supervoxel/src/sam2/training/trainer.py +1051 -0
- nnInteractive/supervoxel/src/sam2/training/utils/__init__.py +5 -0
- nnInteractive/supervoxel/src/sam2/training/utils/checkpoint_utils.py +328 -0
- nnInteractive/supervoxel/src/sam2/training/utils/data_utils.py +166 -0
- nnInteractive/supervoxel/src/sam2/training/utils/distributed.py +560 -0
- nnInteractive/supervoxel/src/sam2/training/utils/logger.py +236 -0
- nnInteractive/supervoxel/src/sam2/training/utils/train_utils.py +275 -0
- nnInteractive/supervoxel/src/supervoxel.py +198 -0
- nnInteractive/trainer/__init__.py +0 -0
- nnInteractive/trainer/nnInteractiveTrainer.py +24 -0
- nnInteractive/utils/__init__.py +0 -0
- nnInteractive/utils/bboxes.py +217 -0
- nnInteractive/utils/checkpoint_cleansing.py +9 -0
- nnInteractive/utils/crop.py +268 -0
- nnInteractive/utils/erosion_dilation.py +48 -0
- nnInteractive/utils/inference_helpers.py +45 -0
- nnInteractive/utils/os_shennanigans.py +16 -0
- nnInteractive/utils/rounding.py +13 -0
- nninteractive-2.0.0.dist-info/METADATA +511 -0
- nninteractive-2.0.0.dist-info/RECORD +76 -0
- nninteractive-2.0.0.dist-info/WHEEL +5 -0
- nninteractive-2.0.0.dist-info/licenses/LICENSE +201 -0
- nninteractive-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
|
|
4
|
+
# This source code is licensed under the license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
# Code borrowed from TLC - https://www.internalfb.com/code/fbsource/fbcode/pytorch/tlc/torchtlc/loggers/tensorboard.py
|
|
8
|
+
import atexit
|
|
9
|
+
import functools
|
|
10
|
+
import logging
|
|
11
|
+
import sys
|
|
12
|
+
import uuid
|
|
13
|
+
from typing import Any, Dict, Optional, Union
|
|
14
|
+
|
|
15
|
+
from hydra.utils import instantiate
|
|
16
|
+
|
|
17
|
+
from iopath.common.file_io import g_pathmgr
|
|
18
|
+
from numpy import ndarray
|
|
19
|
+
from torch import Tensor
|
|
20
|
+
from torch.utils.tensorboard import SummaryWriter
|
|
21
|
+
|
|
22
|
+
from training.utils.train_utils import get_machine_local_and_dist_rank, makedir
|
|
23
|
+
|
|
24
|
+
Scalar = Union[Tensor, ndarray, int, float]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def make_tensorboard_logger(log_dir: str, **writer_kwargs: Any):
    """Build a ``TensorBoardLogger`` that writes into ``log_dir``.

    The directory is created first through ``makedir`` (its boolean result is
    not inspected here), then a ``TensorBoardLogger`` is constructed with
    ``SummaryWriter`` as the writer factory.

    Args:
        log_dir (str): directory the logger is pointed at
        **writer_kwargs: extra keyword arguments forwarded to ``TensorBoardLogger``

    Returns:
        The newly constructed ``TensorBoardLogger``.
    """
    makedir(log_dir)
    return TensorBoardLogger(
        path=log_dir,
        summary_writer_method=SummaryWriter,
        **writer_kwargs,
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TensorBoardWriterWrapper:
    """
    Owns an optional SummaryWriter-like object and exposes flush/close on it.
    """

    def __init__(
        self,
        path: str,
        *args: Any,
        filename_suffix: str = None,
        summary_writer_method: Any = SummaryWriter,
        **kwargs: Any,
    ) -> None:
        """Create the wrapper and, on rank 0 only, the underlying writer.

        The distributed rank is taken from ``get_machine_local_and_dist_rank()``.
        When it is 0, ``summary_writer_method`` is called to create the writer;
        on any other rank no writer is created and every operation on this
        object is a no-op. ``close`` is registered with ``atexit`` in both cases.

        Args:
            path (str): path to write logs to; passed to the writer as ``log_dir``
            *args: extra positional arguments passed to ``summary_writer_method``
            filename_suffix (str): suffix passed to the writer; a random UUID
                string is used when this is falsy
            summary_writer_method: callable that builds the writer
            **kwargs: extra keyword arguments passed to ``summary_writer_method``
        """
        self._writer: Optional[SummaryWriter] = None
        _, self._rank = get_machine_local_and_dist_rank()
        self._path: str = path
        if self._rank != 0:
            logging.debug(f"Not logging meters on this host because env RANK: {self._rank} != 0")
        else:
            logging.info(f"TensorBoard SummaryWriter instantiated. Files will be stored in: {path}")
            # Fall back to a fresh UUID so the suffix is never empty.
            suffix = filename_suffix or str(uuid.uuid4())
            self._writer = summary_writer_method(
                log_dir=path,
                *args,
                filename_suffix=suffix,
                **kwargs,
            )
        # Make sure the writer gets closed when the interpreter exits.
        atexit.register(self.close)

    @property
    def writer(self) -> Optional[SummaryWriter]:
        """The underlying writer, or None if there is none (never created or already closed)."""
        return self._writer

    @property
    def path(self) -> str:
        """The path this wrapper was constructed with."""
        return self._path

    def flush(self) -> None:
        """Flush the underlying writer; does nothing when there is no writer."""
        if self._writer:
            self._writer.flush()

    def close(self) -> None:
        """Close the underlying writer and drop the reference to it.

        Does nothing when there is no writer, so calling it repeatedly is safe.
        After a successful close, ``writer`` returns None.
        """
        active_writer = self._writer
        if not active_writer:
            return
        active_writer.close()
        self._writer = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class TensorBoardLogger(TensorBoardWriterWrapper):
    """
    Scalar and hyperparameter logging on top of TensorBoardWriterWrapper.

    Every method is a no-op when the wrapper holds no writer.
    """

    def log_dict(self, payload: Dict[str, Scalar], step: int) -> None:
        """Log every entry of ``payload`` as a scalar at the same step.

        Args:
            payload (dict): dictionary of tag name and scalar value
            step (int): step value to record
        """
        if self._writer:
            for tag, value in payload.items():
                self.log(tag, value, step)

    def log(self, name: str, data: Scalar, step: int) -> None:
        """Log one scalar through the writer's ``add_scalar``.

        Args:
            name (string): tag name used to group scalars
            data (float/int/Tensor): scalar data to log
            step (int): step value to record, passed as ``global_step``
        """
        if self._writer:
            self._writer.add_scalar(name, data, global_step=step, new_style=True)

    def log_hparams(self, hparams: Dict[str, Scalar], meters: Dict[str, Scalar]) -> None:
        """Log hyperparameters through the writer's ``add_hparams``.

        Args:
            hparams (dict): dictionary of hyperparameter names and corresponding values
            meters (dict): dictionary of meter names and corresponding values
        """
        if self._writer:
            self._writer.add_hparams(hparams, meters)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class Logger:
    """
    Front-end that forwards logging calls to the configured backend.

    Only a TensorBoard backend is wired in; when it is disabled or not
    configured, every logging method does nothing.
    """

    def __init__(self, logging_conf):
        """Instantiate the TensorBoard backend described by ``logging_conf``.

        ``logging_conf.tensorboard_writer`` is read as the backend config. If it
        is truthy, its ``should_log`` entry is popped (default True), which
        mutates the config; the backend is built with ``instantiate`` only when
        that value is truthy.
        """
        tb_config = logging_conf.tensorboard_writer
        # "should_log: false" in the config turns TensorBoard off; the key is
        # removed before the remaining config is handed to instantiate().
        enabled = tb_config and tb_config.pop("should_log", True)
        if enabled:
            self.tb_logger = instantiate(tb_config)
        else:
            self.tb_logger = None

    def log_dict(self, payload: Dict[str, Scalar], step: int) -> None:
        """Forward a dictionary of scalars to the backend, if there is one."""
        if not self.tb_logger:
            return
        self.tb_logger.log_dict(payload, step)

    def log(self, name: str, data: Scalar, step: int) -> None:
        """Forward a single scalar to the backend, if there is one."""
        if not self.tb_logger:
            return
        self.tb_logger.log(name, data, step)

    def log_hparams(self, hparams: Dict[str, Scalar], meters: Dict[str, Scalar]) -> None:
        """Forward hyperparameters and meters to the backend, if there is one."""
        if not self.tb_logger:
            return
        self.tb_logger.log_hparams(hparams, meters)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Memoise the opened file object per file name, so that repeated calls to
# `setup_logging` with the same file name share a single stream.
@functools.lru_cache(maxsize=None)
def _cached_log_stream(filename):
    """Open ``filename`` for appending via ``g_pathmgr`` and return the stream.

    The stream's ``close`` is registered with ``atexit``. Thanks to the
    ``lru_cache`` decorator, each distinct ``filename`` is opened only once.
    """
    # Explicit buffer size (10 * 1024 bytes) so that logs are updated frequently.
    buffer_size_bytes = 10 * 1024
    stream = g_pathmgr.open(filename, mode="a", buffering=buffer_size_bytes)
    atexit.register(stream.close)
    return stream
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def setup_logging(
    name,
    output_dir=None,
    rank=0,
    log_level_primary="INFO",
    log_level_secondary="ERROR",
):
    """
    Configure the logger called ``name`` with a stdout handler and, on rank 0,
    an optional file handler, then install it as ``logging.root``.

    Args:
        name: name of the logger to configure
        output_dir: if truthy, the directory is created with ``makedir`` and,
            on rank 0, logs are also appended to ``<output_dir>/log.txt``
        rank: rank of the calling process; only rank 0 gets the file handler
        log_level_primary: level of the logger itself, of the file handler and
            of the console handler on rank 0
        log_level_secondary: level of the console handler on every other rank

    Side effects: all handlers previously attached to the named logger are
    removed, the handler list of ``logger.root`` is emptied, and
    ``logging.root`` is rebound to the configured logger.
    """
    # get the filename if we want to log to the file as well
    log_filename = None
    if output_dir:
        makedir(output_dir)
        if rank == 0:
            log_filename = f"{output_dir}/log.txt"

    logger = logging.getLogger(name)
    logger.setLevel(log_level_primary)

    # create formatter
    FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)4d: %(message)s"
    formatter = logging.Formatter(FORMAT)

    # Cleanup any existing handlers. Iterate over a snapshot: removeHandler()
    # mutates logger.handlers, and removing while iterating the live list
    # skips every other handler and leaves stale ones attached.
    for h in list(logger.handlers):
        logger.removeHandler(h)
    logger.root.handlers = []

    # setup the console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    if rank == 0:
        console_handler.setLevel(log_level_primary)
    else:
        console_handler.setLevel(log_level_secondary)

    # we log to file as well if user wants
    if log_filename and rank == 0:
        file_handler = logging.StreamHandler(_cached_log_stream(log_filename))
        file_handler.setLevel(log_level_primary)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    # Module-level helpers such as logging.info() go through logging.root, so
    # rebinding it routes them to the logger configured above.
    logging.root = logger
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def shutdown_logging():
    """
    Log a shutdown notice, then close every handler attached to ``logging.root``.
    """
    logging.info("Shutting down loggers...")
    for active_handler in logging.root.handlers:
        active_handler.close()
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
|
|
4
|
+
# This source code is licensed under the license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import math
|
|
9
|
+
import os
|
|
10
|
+
import random
|
|
11
|
+
import re
|
|
12
|
+
from datetime import timedelta
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
import hydra
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import omegaconf
|
|
19
|
+
import torch
|
|
20
|
+
import torch.distributed as dist
|
|
21
|
+
from iopath.common.file_io import g_pathmgr
|
|
22
|
+
from omegaconf import OmegaConf
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def multiply_all(*args):
    """Return the product of all positional arguments as a plain Python scalar."""
    factors = np.array(args)
    product = np.prod(factors)
    return product.item()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def collect_dict_keys(config):
    """Recursively walk a configuration and gather every ``dict_key`` of a collate-function node.

    A node counts as a collate-function node when it has a ``_target_`` entry
    matching ``.*collate_fn.*``; its ``dict_key`` is collected and the node is
    not descended into. Otherwise the node's values are visited: values of the
    same type as ``config`` are recursed into, and ``ListConfig`` values are
    scanned for items of that type.

    Returns:
        list of collected ``dict_key`` values, in traversal order.
    """
    # A node that points to the collate function carries the key itself.
    if "_target_" in config and re.match(r".*collate_fn.*", config["_target_"]):
        return [config["dict_key"]]

    node_type = type(config)
    found = []
    for child in config.values():
        if isinstance(child, node_type):
            found += collect_dict_keys(child)
        elif isinstance(child, omegaconf.listconfig.ListConfig):
            for entry in child:
                if isinstance(entry, node_type):
                    found += collect_dict_keys(entry)
    return found
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Phase:
    """String constants naming the two phases, ``"train"`` and ``"val"``."""

    TRAIN = "train"
    VAL = "val"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def register_omegaconf_resolvers():
    """Register the custom OmegaConf resolvers, one after another.

    Registered names, in order: ``get_method``, ``get_class``, ``add``,
    ``times``, ``divide``, ``pow``, ``subtract``, ``range``, ``int``,
    ``ceil_int`` and ``merge``.
    """
    resolver_table = (
        ("get_method", hydra.utils.get_method),
        ("get_class", hydra.utils.get_class),
        ("add", lambda x, y: x + y),
        ("times", multiply_all),
        ("divide", lambda x, y: x / y),
        ("pow", lambda x, y: x**y),
        ("subtract", lambda x, y: x - y),
        ("range", lambda x: list(range(x))),
        ("int", lambda x: int(x)),
        ("ceil_int", lambda x: int(math.ceil(x))),
        ("merge", lambda *x: OmegaConf.merge(*x)),
    )
    for resolver_name, resolver_fn in resolver_table:
        OmegaConf.register_new_resolver(resolver_name, resolver_fn)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def setup_distributed_backend(backend, timeout_mins):
    """
    Initialize the torch.distributed process group and return this process's rank.

    Sets the ``TORCH_NCCL_ASYNC_ERROR_HANDLING`` environment variable to "1"
    before initializing, then calls ``dist.init_process_group`` with the given
    backend and a timeout of ``timeout_mins`` minutes.
    Environment-variable initialization is described at
    https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
    """
    # enable TORCH_NCCL_ASYNC_ERROR_HANDLING to ensure dist nccl ops time out
    # after timeout_mins of waiting
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1"
    logging.info(f"Setting up torch.distributed with a timeout of {timeout_mins} mins")
    group_timeout = timedelta(minutes=timeout_mins)
    dist.init_process_group(backend=backend, timeout=group_timeout)
    return dist.get_rank()
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_machine_local_and_dist_rank():
    """
    Read the local and distributed rank of the current process from the
    ``LOCAL_RANK`` and ``RANK`` environment variables.

    Returns:
        tuple ``(local_rank, distributed_rank)``, both as ``int``.

    Raises:
        AssertionError: if either environment variable is not set.
        ValueError: if either variable is set but is not an integer literal.
    """
    local_rank = os.environ.get("LOCAL_RANK")
    distributed_rank = os.environ.get("RANK")
    # Check presence before converting: int(None) would raise a bare TypeError
    # and the explanatory message below would never be reached.
    if local_rank is None or distributed_rank is None:
        raise AssertionError("Please the set the RANK and LOCAL_RANK environment variables.")
    return int(local_rank), int(distributed_rank)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def print_cfg(cfg):
    """
    Log a header line followed by the YAML rendering of ``cfg`` produced by
    ``OmegaConf.to_yaml``.
    """
    header = "Training with config:"
    logging.info(header)
    rendered = OmegaConf.to_yaml(cfg)
    logging.info(rendered)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def set_seeds(seed_value, max_epochs, dist_rank):
    """
    Seed Python's ``random``, NumPy and torch (plus all CUDA devices when CUDA
    is available) with a per-rank seed.

    The seed actually used is ``(seed_value + dist_rank) * max_epochs``; it is
    logged before being applied.
    """
    # Since in the pytorch sampler, we increment the seed by 1 for every epoch.
    machine_seed = (seed_value + dist_rank) * max_epochs
    logging.info(f"MACHINE SEED: {machine_seed}")
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(machine_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(machine_seed)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def makedir(dir_path):
    """
    Create the directory through ``g_pathmgr`` if it does not exist.

    Returns:
        True when the existence check and, if needed, the creation completed
        without raising; False when either of them raised.
    """
    try:
        if not g_pathmgr.exists(dir_path):
            g_pathmgr.mkdirs(dir_path)
    except Exception:
        # Catch Exception rather than BaseException so that KeyboardInterrupt
        # and SystemExit still propagate; log the traceback so the cause of
        # the failure is not lost.
        logging.exception(f"Error creating directory: {dir_path}")
        return False
    return True
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def is_dist_avail_and_initialized():
    """Return True only when torch.distributed is both available and initialized."""
    return dist.is_available() and dist.is_initialized()
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def get_amp_type(amp_type: Optional[str] = None):
    """Translate an AMP type name into the matching torch dtype.

    Args:
        amp_type: ``"bfloat16"``, ``"float16"`` or None

    Returns:
        ``torch.bfloat16`` or ``torch.float16`` for the respective name, and
        None when ``amp_type`` is None.

    Raises:
        AssertionError: for any other value.
    """
    if amp_type is None:
        return None
    assert amp_type in ["bfloat16", "float16"], "Invalid Amp type."
    return torch.bfloat16 if amp_type == "bfloat16" else torch.float16
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def log_env_variables():
    """Log every environment variable as ``KEY=value`` lines, sorted by key.

    Two INFO records are emitted: a header, then a single message holding all
    the lines. NOTE(review): the values are logged verbatim, so anything
    sensitive stored in the environment ends up in the logs.
    """
    dump = "".join(f"{key}={os.environ[key]}\n" for key in sorted(os.environ))
    logging.info("Logging ENV_VARIABLES")
    logging.info(dump)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class AverageMeter:
    """Tracks the latest value together with a running weighted sum, count and average."""

    def __init__(self, name, device, fmt=":f"):
        """Store the meter's name, format spec and device, then zero the statistics.

        ``fmt`` is a format spec (including the leading colon) used by
        ``__str__``; ``device`` is only stored on the instance.
        """
        self.name = name
        self.fmt = fmt
        self.device = device
        self.reset()

    def reset(self):
        """Zero ``val``, ``avg``, ``sum`` and ``count``."""
        self.val = self.avg = self.sum = self.count = 0
        self._allow_updates = True

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n``, and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        """Render as ``name: val (avg)``, with both numbers formatted using ``fmt``."""
        template = f"{{name}}: {{val{self.fmt}}} ({{avg{self.fmt}}})"
        return template.format(**vars(self))
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class MemMeter:
    """Tracks the current, average and maximum of the peak CUDA memory reading per update."""

    def __init__(self, name, device, fmt=":f"):
        """Store the meter's name, format spec and device, then zero the statistics.

        ``device`` is only stored on the instance; the memory queries in
        ``update`` are made without a device argument.
        """
        self.name = name
        self.fmt = fmt
        self.device = device
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = 0  # reading taken at the latest update
        self.avg = 0  # weighted average of the readings
        self.peak = 0  # largest reading seen since the last reset()
        self.sum = 0
        self.count = 0
        self._allow_updates = True

    def update(self, n=1, reset_peak_usage=True):
        """Take a new reading and fold it into the statistics.

        The reading is ``torch.cuda.max_memory_allocated() // 1e9`` (floor
        division, so it is a whole number). When ``reset_peak_usage`` is true,
        torch's peak memory statistics are reset afterwards.
        """
        reading = torch.cuda.max_memory_allocated() // 1e9
        self.val = reading
        self.sum += self.val * n
        self.count += n
        self.avg = self.sum / self.count
        self.peak = max(self.peak, self.val)
        if reset_peak_usage:
            torch.cuda.reset_peak_memory_stats()

    def __str__(self):
        """Render as ``name: val (avg/peak)``, with the numbers formatted using ``fmt``."""
        template = f"{{name}}: {{val{self.fmt}}} ({{avg{self.fmt}}}/{{peak{self.fmt}}})"
        return template.format(**vars(self))
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def human_readable_time(time_seconds):
    """Format a duration in seconds as ``DDd HHh MMm``.

    The input is truncated with ``int()`` first; leftover seconds are dropped.
    """
    total_minutes = int(time_seconds) // 60
    days, minutes_of_day = divmod(total_minutes, 24 * 60)
    hours, minutes = divmod(minutes_of_day, 60)
    return f"{days:02}d {hours:02}h {minutes:02}m"
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class DurationMeter:
    """Holds a single duration value that can be overwritten or accumulated."""

    def __init__(self, name, device, fmt=":f"):
        """Store name, device and format spec; the value starts at 0."""
        self.name = name
        self.device = device
        self.fmt = fmt
        self.val = 0

    def reset(self):
        """Set the value back to 0."""
        self.val = 0

    def update(self, val):
        """Overwrite the value with ``val``."""
        self.val = val

    def add(self, val):
        """Add ``val`` to the current value."""
        self.val += val

    def __str__(self):
        """Render as ``name: <duration>``, using ``human_readable_time`` for the value."""
        pretty = human_readable_time(self.val)
        return f"{self.name}: {pretty}"
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class ProgressMeter:
    """Formats and logs a one-line progress summary built from a set of meters."""

    def __init__(self, num_batches, meters, real_meters, prefix=""):
        """Store the meters and precompute the ``[batch/total]`` format string.

        Args:
            num_batches: total number of batches, shown in the progress counter
            meters: iterable of objects rendered with ``str()``
            real_meters: mapping of name to an object whose ``compute()``
                returns a mapping of sub-name to numeric value
            prefix: text placed in front of the progress counter
        """
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.real_meters = real_meters
        self.prefix = prefix

    def display(self, batch, enable_print=False):
        """Log the summary line for ``batch``; also print it when ``enable_print`` is true.

        The line joins, with ``" | "``: the prefixed batch counter, each meter's
        string form, and one entry per real meter listing its computed values
        as ``name/subname: value`` (names joined with ``os.path.join``, values
        with four decimals).
        """
        parts = [self.prefix + self.batch_fmtstr.format(batch)]
        parts.extend(str(meter) for meter in self.meters)
        for name, meter in self.real_meters.items():
            stats = meter.compute()
            parts.append(
                " | ".join(f"{os.path.join(name, subname)}: {val:.4f}" for subname, val in stats.items())
            )
        line = " | ".join(parts)
        logging.info(line)
        if enable_print:
            print(line)

    def _get_batch_fmtstr(self, num_batches):
        """Return a ``[{:Nd}/total]`` template, where N is the digit count of ``num_batches``."""
        width = len(str(num_batches // 1))
        slot = "{:%dd}" % width
        return "[%s/%s]" % (slot, slot.format(num_batches))
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def get_resume_checkpoint(checkpoint_save_dir):
    """Return the path of ``checkpoint.pt`` inside ``checkpoint_save_dir``, if present.

    Returns:
        The joined path when ``checkpoint_save_dir`` is a directory and the
        file exists in it (both checked through ``g_pathmgr``); None otherwise.
    """
    if g_pathmgr.isdir(checkpoint_save_dir):
        candidate = os.path.join(checkpoint_save_dir, "checkpoint.pt")
        if g_pathmgr.isfile(candidate):
            return candidate
    return None
|