birder 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- birder/adversarial/deepfool.py +2 -0
- birder/adversarial/simba.py +2 -0
- birder/common/masking.py +13 -4
- birder/inference/classification.py +1 -1
- birder/introspection/__init__.py +2 -0
- birder/introspection/base.py +0 -7
- birder/introspection/feature_pca.py +101 -0
- birder/kernels/soft_nms/soft_nms.cpp +5 -2
- birder/model_registry/model_registry.py +3 -2
- birder/net/convnext_v1.py +20 -0
- birder/net/fastvit.py +0 -1
- birder/net/flexivit.py +5 -0
- birder/net/focalnet.py +0 -1
- birder/net/rope_flexivit.py +7 -0
- birder/net/rope_vit.py +49 -4
- birder/net/smt.py +0 -1
- birder/net/ssl/ibot.py +0 -1
- birder/net/vit.py +166 -2
- birder/scripts/train.py +7 -6
- birder/scripts/train_barlow_twins.py +4 -3
- birder/scripts/train_byol.py +4 -3
- birder/scripts/train_capi.py +6 -5
- birder/scripts/train_data2vec.py +4 -3
- birder/scripts/train_data2vec2.py +4 -3
- birder/scripts/train_detection.py +7 -5
- birder/scripts/train_dino_v1.py +5 -4
- birder/scripts/train_dino_v2.py +69 -20
- birder/scripts/train_dino_v2_dist.py +70 -21
- birder/scripts/train_franca.py +8 -7
- birder/scripts/train_i_jepa.py +4 -3
- birder/scripts/train_ibot.py +5 -4
- birder/scripts/train_kd.py +8 -8
- birder/scripts/train_mim.py +4 -3
- birder/scripts/train_mmcr.py +4 -3
- birder/scripts/train_rotnet.py +5 -4
- birder/scripts/train_simclr.py +4 -3
- birder/scripts/train_vicreg.py +4 -3
- birder/tools/avg_model.py +24 -8
- birder/tools/introspection.py +35 -9
- birder/tools/show_iterator.py +1 -1
- birder/version.py +1 -1
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/METADATA +1 -1
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/RECORD +47 -46
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/WHEEL +0 -0
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/entry_points.txt +0 -0
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {birder-0.3.1.dist-info → birder-0.3.2.dist-info}/top_level.txt +0 -0
birder/net/vit.py
CHANGED
@@ -91,6 +91,126 @@ class PatchEmbed(nn.Module):
         return x
 
 
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        attn_drop: float,
+        proj_drop: float,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        norm_layer_eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        if qk_norm is True:
+            self.q_norm = norm_layer(self.head_dim, eps=norm_layer_eps)
+            self.k_norm = norm_layer(self.head_dim, eps=norm_layer_eps)
+        else:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    # Make the same interface as nn.MultiheadAttention forward for TorchScript compatibility
+    def forward(
+        self,
+        x: torch.Tensor,
+        key: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+        value: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+        need_weights: bool = False,
+        attn_mask: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+        average_attn_weights: bool = False,
+        is_causal: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Apply multi-head self-attention to the input sequence
+
+        This module implements scaled dot-product attention over x and returns the
+        projected output. The method signature intentionally matches
+        torch.nn.MultiheadAttention.forward for TorchScript compatibility.
+
+        Compatibility notes
+        -------------------
+        The following parameters are accepted for API compatibility but are ignored by this implementation:
+        - key: ignored (keys are computed from x)
+        - value: ignored (values are computed from x)
+        - attn_mask: ignored (no external attention mask is applied)
+
+        Parameters
+        ----------
+        x
+            Input tensor of shape (B, N, C) where B is batch size, N is sequence length,
+            and C is embedding dimension.
+        key
+            Unused. Present for nn.MultiheadAttention-compatible signature.
+        value
+            Unused. Present for nn.MultiheadAttention-compatible signature.
+        need_weights
+            If True, also return attention weights computed explicitly. If False, uses
+            torch.nn.functional.scaled_dot_product_attention and returns None for attention weights.
+        attn_mask
+            Unused. Present for nn.MultiheadAttention-compatible signature.
+        average_attn_weights
+            If True and need_weights is True, average attention weights across heads
+            to shape (B, N, N). If False, return per-head weights of shape (B, num_heads, N, N).
+        is_causal
+            If True, apply a causal (upper-triangular) mask so positions cannot attend to future tokens.
+
+        Returns
+        -------
+        A tuple containing two elements:
+        - output: Tensor of shape (B, N, C)
+        - attn_weights: attention weights if need_weights is True, otherwise None.
+        """
+
+        (B, N, C) = x.size()
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
+        (q, k, v) = qkv.unbind(0)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        attn_weights: Optional[torch.Tensor] = None
+        if need_weights is True:
+            # Compute attention manually to get weights
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            if is_causal is True:
+                causal_mask = torch.triu(
+                    torch.full((N, N), float("-inf"), dtype=attn.dtype, device=attn.device),
+                    diagonal=1,
+                )
+                attn = attn + causal_mask
+
+            attn = attn.softmax(dim=-1)
+            attn_weights = attn
+            attn = self.attn_drop(attn)
+            x = attn @ v
+
+            if average_attn_weights is True:
+                # Average across heads: (B, num_heads, N, N) -> (B, N, N)
+                attn_weights = attn_weights.mean(dim=1)
+        else:
+            x = F.scaled_dot_product_attention(  # pylint: disable=not-callable
+                q, k, v, dropout_p=self.attn_drop.p if self.training else 0.0, is_causal=is_causal, scale=self.scale
+            )
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        return (x, attn_weights)
+
+
 class EncoderBlock(nn.Module):
     def __init__(
         self,
@@ -105,17 +225,37 @@ class EncoderBlock(nn.Module):
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
         norm_layer_eps: float = 1e-6,
         mlp_layer: Callable[..., nn.Module] = FFN,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
     ) -> None:
         super().__init__()
         self.need_attn = False
         self.is_causal = False
+        self.use_custom_attn = qk_norm is True
 
         if mlp_dim is None:
             mlp_dim = hidden_dim * 4
 
         # Attention block
         self.ln1 = norm_layer(hidden_dim, eps=norm_layer_eps)
-        self.self_attention = nn.MultiheadAttention(hidden_dim, num_heads, dropout=attention_dropout, batch_first=True)
+
+        if self.use_custom_attn is False:
+            # Prefer PyTorch's built-in MultiheadAttention for the "standard" case
+            self.self_attention = nn.MultiheadAttention(
+                hidden_dim, num_heads, dropout=attention_dropout, bias=qkv_bias, batch_first=True
+            )
+        else:
+            self.self_attention = Attention(
+                hidden_dim,
+                num_heads=num_heads,
+                attn_drop=attention_dropout,
+                proj_drop=0.0,
+                qkv_bias=qkv_bias,
+                qk_norm=qk_norm,
+                norm_layer=norm_layer,
+                norm_layer_eps=norm_layer_eps,
+            )
+
         self.drop_path1 = StochasticDepth(drop_path, mode="row")
         if layer_scale_init_value is not None:
             self.layer_scale_1 = LayerScale(hidden_dim, layer_scale_init_value)
@@ -148,10 +288,11 @@ class EncoderBlock(nn.Module):
             branch1,
             branch1,
             need_weights=self.need_attn,
-            attn_mask=attn_mask,
+            attn_mask=attn_mask,  # Ignored on the custom attention
             average_attn_weights=False,
             is_causal=self.is_causal,
         )
+
         branch1 = self.layer_scale_1(branch1)
         branch1 = self.drop_path1(branch1) + x
 
@@ -181,6 +322,8 @@ class Encoder(nn.Module):
         attention_dropout: float,
         dpr: list[float],
         pre_norm: bool = False,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
         activation_layer: Callable[..., nn.Module] = nn.GELU,
         layer_scale_init_value: Optional[float] = None,
         norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
@@ -211,6 +354,8 @@ class Encoder(nn.Module):
                     norm_layer=norm_layer,
                     norm_layer_eps=norm_layer_eps,
                     mlp_layer=mlp_layer,
+                    qkv_bias=qkv_bias,
+                    qk_norm=qk_norm,
                 )
             )
 
@@ -267,6 +412,8 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
         layer_scale_init_value: Optional[float] = self.config.get("layer_scale_init_value", None)
         pre_norm: bool = self.config.get("pre_norm", False)
         post_norm: bool = self.config.get("post_norm", True)
+        qkv_bias: bool = self.config.get("qkv_bias", True)
+        qk_norm: bool = self.config.get("qk_norm", False)
         num_reg_tokens: int = self.config.get("num_reg_tokens", 0)
         class_token: bool = self.config.get("class_token", True)
         attn_pool_head: bool = self.config.get("attn_pool_head", False)
@@ -351,6 +498,8 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
             attention_dropout,
             dpr,
             pre_norm=pre_norm,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
             activation_layer=act_layer,
             layer_scale_init_value=layer_scale_init_value,
             norm_layer=norm_layer,
@@ -389,6 +538,7 @@ class ViT(DetectorBackbone, PreTrainEncoder, MaskedTokenOmissionMixin, MaskedTok
                 drop_path=0,
                 activation_layer=act_layer,
                 norm_layer=norm_layer,
+                norm_layer_eps=norm_layer_eps,
                 mlp_layer=mlp_layer,
             )
 
@@ -846,6 +996,20 @@ registry.register_model_config(
         "drop_path_rate": 0.1,
     },
 )
+registry.register_model_config(
+    "vit_b16_qkn_ls",
+    ViT,
+    config={
+        "patch_size": 16,
+        "num_layers": 12,
+        "num_heads": 12,
+        "hidden_dim": 768,
+        "mlp_dim": 3072,
+        "layer_scale_init_value": 1e-5,
+        "qk_norm": True,
+        "drop_path_rate": 0.1,
+    },
+)
 registry.register_model_config(
     "vit_b16_pn_quick_gelu",
     ViT,
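Note on the new Attention module above: it takes two code paths. When need_weights is True it materializes the softmax attention matrix so the weights can be returned; otherwise it calls the fused F.scaled_dot_product_attention kernel. A minimal standalone sketch (independent of birder, assuming PyTorch >= 2.1 for the scale keyword) showing that the two paths agree when dropout is disabled:

    import torch
    import torch.nn.functional as F

    (B, N, num_heads, head_dim) = (2, 5, 4, 8)  # toy sizes
    q = torch.randn(B, num_heads, N, head_dim)
    k = torch.randn(B, num_heads, N, head_dim)
    v = torch.randn(B, num_heads, N, head_dim)
    scale = head_dim**-0.5

    # Explicit path (need_weights=True): the attention matrix is materialized
    attn = (q @ k.transpose(-2, -1)) * scale
    attn = attn.softmax(dim=-1)
    out_explicit = attn @ v

    # Fused path (need_weights=False): the attention matrix is never materialized
    out_fused = F.scaled_dot_product_attention(q, k, v, scale=scale)

    print(torch.allclose(out_explicit, out_fused, atol=1e-5))  # True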
birder/scripts/train.py
CHANGED
@@ -474,14 +474,15 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue(window_size=64)
+    running_val_loss = training_utils.SmoothedValue()
+    train_accuracy = training_utils.SmoothedValue(window_size=64)
+    val_accuracy = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue(window_size=64)
-        running_val_loss = training_utils.SmoothedValue()
-        train_accuracy = training_utils.SmoothedValue(window_size=64)
-        val_accuracy = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -566,7 +567,7 @@ def train(args: argparse.Namespace) -> None:
             train_accuracy.update(training_utils.accuracy(targets, outputs.detach()))
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -806,6 +807,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -814,7 +816,6 @@ def get_args_parser() -> argparse.ArgumentParser:
             "('drop_path_rate=0.2' or '{\"units\": [3, 24, 36, 3], \"dropout\": 0.2}'"
         ),
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
    parser.add_argument("--reset-head", default=False, action="store_true", help="reset the classification head")
     parser.add_argument(
         "--freeze-body",
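Note on the logging guard changed above (and repeated in every training script below): i % args.log_interval == 0 is also true at i == 0, so the very first log fired after a single batch, presumably skewing the first interval's reported statistics. A small self-contained illustration with a hypothetical interval value:

    log_interval = 100  # hypothetical value

    for i in [0, 100, 200]:
        old_fires = i % log_interval == 0            # also fires at batch 0
        new_fires = i % log_interval == 0 and i > 0  # waits for a full interval
        print(i, old_fires, new_fires)
    # 0 True False
    # 100 True True
    # 200 True True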
birder/scripts/train_barlow_twins.py
CHANGED
@@ -358,11 +358,12 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -426,7 +427,7 @@ def train(args: argparse.Namespace) -> None:
             running_loss.update(loss.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -566,6 +567,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to train")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -583,7 +585,6 @@ def get_args_parser() -> argparse.ArgumentParser:
         help="projector mlp dimensions",
     )
     parser.add_argument("--off-lambda", type=float, default=0.0051, help="weight on off-diagonal terms")
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     training_cli.add_optimization_args(parser)
     training_cli.add_lr_wd_args(parser)
     training_cli.add_lr_scheduler_args(parser)
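Note on the other recurring change (here and in the scripts below): the running SmoothedValue trackers moved out of the epoch loop, so their windows persist across epoch boundaries instead of restarting empty every epoch. A hypothetical minimal stand-in for training_utils.SmoothedValue (illustrative only, not birder's implementation; the avg property name is assumed) to make the behavior concrete:

    from collections import deque
    from typing import Optional

    class SmoothedValue:
        # Hypothetical stand-in, not birder's implementation
        def __init__(self, window_size: Optional[int] = None) -> None:
            self.values: deque[float] = deque(maxlen=window_size)

        def update(self, value: float) -> None:
            self.values.append(float(value))

        @property
        def avg(self) -> float:
            return sum(self.values) / max(len(self.values), 1)

Constructed once before the epoch loop (the new placement), the deque keeps its most recent window across epochs; constructed inside the loop (the old placement), it restarted empty at every epoch.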
birder/scripts/train_byol.py
CHANGED
@@ -370,11 +370,12 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -449,7 +450,7 @@ def train(args: argparse.Namespace) -> None:
             running_loss.update(loss.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -590,6 +591,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -611,7 +613,6 @@ def get_args_parser() -> argparse.ArgumentParser:
         default=0.99,
         help="base EMA parameter for teacher update, set a higher value with small batches",
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     training_cli.add_optimization_args(parser)
     training_cli.add_lr_wd_args(parser)
     training_cli.add_lr_scheduler_args(parser)
birder/scripts/train_capi.py
CHANGED
@@ -444,13 +444,14 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+    running_clustering_loss = training_utils.SmoothedValue()
+    running_target_entropy = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
-        running_clustering_loss = training_utils.SmoothedValue()
-        running_target_entropy = training_utils.SmoothedValue()
 
         if args.sinkhorn_queue_size is not None:
             queue_active = epoch > args.sinkhorn_queue_warmup_epochs
@@ -564,7 +565,7 @@ def train(args: argparse.Namespace) -> None:
             running_target_entropy.update(target_entropy.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -737,6 +738,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -768,7 +770,6 @@ def get_args_parser() -> argparse.ArgumentParser:
         default=0,
         help="number of initial epochs to disable Sinkhorn queueing",
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     training_cli.add_optimization_args(parser)
     training_cli.add_lr_wd_args(parser)
     training_cli.add_lr_scheduler_args(parser)
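Note on the CAPI context lines above: with the default --sinkhorn-queue-warmup-epochs 0, the condition queue_active = epoch > args.sinkhorn_queue_warmup_epochs still leaves the queue inactive for epoch 0. A quick check with assumed values:

    sinkhorn_queue_warmup_epochs = 0  # default shown in the diff

    for epoch in range(3):
        queue_active = epoch > sinkhorn_queue_warmup_epochs
        print(epoch, queue_active)
    # 0 False
    # 1 True
    # 2 True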
birder/scripts/train_data2vec.py
CHANGED
@@ -384,11 +384,12 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -463,7 +464,7 @@ def train(args: argparse.Namespace) -> None:
             running_loss.update(loss.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -603,6 +604,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -617,7 +619,6 @@ def get_args_parser() -> argparse.ArgumentParser:
         default=0.999,
         help="base EMA parameter for teacher update, set a higher value with small batches",
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     training_cli.add_optimization_args(parser)
     training_cli.add_lr_wd_args(parser)
     training_cli.add_lr_scheduler_args(parser)
birder/scripts/train_data2vec2.py
CHANGED
@@ -393,11 +393,12 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -473,7 +474,7 @@ def train(args: argparse.Namespace) -> None:
             running_loss.update(loss.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -615,6 +616,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -635,7 +637,6 @@ def get_args_parser() -> argparse.ArgumentParser:
         default=0.9998,
         help="base EMA parameter for teacher update, set a higher value with small batches",
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     training_cli.add_optimization_args(parser)
     training_cli.add_lr_wd_args(parser)
     training_cli.add_lr_scheduler_args(parser)
birder/scripts/train_detection.py
CHANGED
@@ -538,12 +538,14 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+    loss_trackers: dict[str, training_utils.SmoothedValue] = {}
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-
-        loss_trackers: dict[str, training_utils.SmoothedValue] = {}
+
         validation_metrics.reset()
 
         if args.distributed is True or virtual_epoch_mode is True:
@@ -634,7 +636,7 @@ def train(args: argparse.Namespace) -> None:
                 loss_trackers[key].update(value.detach())
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -889,6 +891,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -897,8 +900,8 @@ def get_args_parser() -> argparse.ArgumentParser:
             "('drop_path_rate=0.2' or '{\"units\": [3, 24, 36, 3], \"dropout\": 0.2}'"
         ),
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument("--backbone", type=str, help="the neural network to used as backbone")
+    parser.add_argument("--backbone-tag", type=str, help="backbone training log tag (loading only)")
     parser.add_argument(
         "--backbone-model-config",
         action=cli.FlexibleDictAction,
@@ -907,7 +910,6 @@ def get_args_parser() -> argparse.ArgumentParser:
            "('drop_path_rate=0.2' or '{\"units\": [3, 24, 36, 3], \"dropout\": 0.2}'"
         ),
     )
-    parser.add_argument("--backbone-tag", type=str, help="backbone training log tag (loading only)")
     parser.add_argument("--backbone-epoch", type=int, help="load backbone weights from selected epoch")
     parser.add_argument(
         "--backbone-pretrained",
birder/scripts/train_dino_v1.py
CHANGED
@@ -480,12 +480,13 @@ def train(args: argparse.Namespace) -> None:
     if virtual_epoch_mode is True:
         train_iter = iter(training_loader)
 
+    running_loss = training_utils.SmoothedValue()
+    train_proto_agreement = training_utils.SmoothedValue()
+
     logger.info(f"Starting training with learning rate of {last_lr}")
     for epoch in range(begin_epoch, args.stop_epoch):
         tic = time.time()
         net.train()
-        running_loss = training_utils.SmoothedValue()
-        train_proto_agreement = training_utils.SmoothedValue()
 
         if args.distributed is True or virtual_epoch_mode is True:
             train_sampler.set_epoch(epoch)
@@ -581,7 +582,7 @@ def train(args: argparse.Namespace) -> None:
             train_proto_agreement.update(training_utils.accuracy(pred_teacher, pred_student))
 
             # Write statistics
-            if i % args.log_interval == 0 or i == last_batch_idx:
+            if (i % args.log_interval == 0 and i > 0) or i == last_batch_idx:
                 time_now = time.time()
                 time_cost = time_now - start_time
                 iters_processed_in_interval = i - last_idx
@@ -733,6 +734,7 @@ def get_args_parser() -> argparse.ArgumentParser:
         formatter_class=cli.ArgumentHelpFormatter,
     )
     parser.add_argument("-n", "--network", type=str, help="the neural network to use")
+    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--model-config",
         action=cli.FlexibleDictAction,
@@ -788,7 +790,6 @@ def get_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--local-crop-size", type=int, nargs="+", default=[96, 96], metavar=("H", "W"), help="local view size"
     )
-    parser.add_argument("-t", "--tag", type=str, help="add model tag")
     parser.add_argument(
         "--backbone-epoch",
         type=int,