PyPI - trace-tad - Versions diffs - 0.2.0__py3-none-any.whl - Mend

trace-tad 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

configs/__init__.py +0 -0
configs/_dataset.py +98 -0
configs/_model.py +52 -0
configs/large.py +149 -0
configs/small.py +146 -0
tools/__init__.py +0 -0
tools/infer.py +603 -0
tools/prep_dataset.py +83 -0
tools/test.py +187 -0
tools/train.py +250 -0
tools/tune_train.py +42 -0
trace_tad/__init__.py +17 -0
trace_tad/cli.py +945 -0
trace_tad/config.py +179 -0
trace_tad/cores/__init__.py +6 -0
trace_tad/cores/eval_engine.py +341 -0
trace_tad/cores/layer_decay_optimizer.py +93 -0
trace_tad/cores/optimizer.py +135 -0
trace_tad/cores/scheduler.py +212 -0
trace_tad/cores/train_engine.py +156 -0
trace_tad/data_prep.py +1183 -0
trace_tad/datasets/__init__.py +8 -0
trace_tad/datasets/base/__init__.py +5 -0
trace_tad/datasets/base/padding_dataset.py +168 -0
trace_tad/datasets/base/sliding_dataset.py +212 -0
trace_tad/datasets/base/util.py +40 -0
trace_tad/datasets/builder.py +65 -0
trace_tad/datasets/thumos.py +148 -0
trace_tad/datasets/transforms/__init__.py +17 -0
trace_tad/datasets/transforms/end_to_end.py +360 -0
trace_tad/datasets/transforms/formatting.py +305 -0
trace_tad/datasets/transforms/loading.py +263 -0
trace_tad/datasets/transforms/video_transforms.py +502 -0
trace_tad/evaluations/__init__.py +5 -0
trace_tad/evaluations/builder.py +29 -0
trace_tad/evaluations/mAP.py +477 -0
trace_tad/evaluations/precision.py +524 -0
trace_tad/export.py +50 -0
trace_tad/jobs/__init__.py +24 -0
trace_tad/jobs/manager.py +705 -0
trace_tad/jobs/models.py +126 -0
trace_tad/model_artifacts.py +84 -0
trace_tad/models/__init__.py +24 -0
trace_tad/models/backbones/__init__.py +4 -0
trace_tad/models/backbones/backbone_wrapper.py +267 -0
trace_tad/models/backbones/vit_adapter.py +463 -0
trace_tad/models/bricks/__init__.py +7 -0
trace_tad/models/bricks/conv.py +112 -0
trace_tad/models/bricks/gradient_ops.py +37 -0
trace_tad/models/bricks/misc.py +21 -0
trace_tad/models/bricks/sgp.py +123 -0
trace_tad/models/bricks/transformer.py +608 -0
trace_tad/models/builder.py +70 -0
trace_tad/models/dense_heads/__init__.py +6 -0
trace_tad/models/dense_heads/anchor_free_head.py +309 -0
trace_tad/models/dense_heads/prior_generator/__init__.py +2 -0
trace_tad/models/dense_heads/prior_generator/point_generator.py +36 -0
trace_tad/models/dense_heads/tridet_bm_head.py +377 -0
trace_tad/models/dense_heads/tridet_head.py +393 -0
trace_tad/models/detectors/__init__.py +6 -0
trace_tad/models/detectors/base.py +83 -0
trace_tad/models/detectors/single_stage.py +138 -0
trace_tad/models/detectors/tridet.py +194 -0
trace_tad/models/detectors/tridet_bm.py +20 -0
trace_tad/models/losses/__init__.py +5 -0
trace_tad/models/losses/boundary_loss.py +202 -0
trace_tad/models/losses/focal_loss.py +166 -0
trace_tad/models/losses/iou_loss.py +47 -0
trace_tad/models/necks/__init__.py +4 -0
trace_tad/models/necks/fpn.py +127 -0
trace_tad/models/necks/temporal_deformable_fpn.py +181 -0
trace_tad/models/projections/__init__.py +2 -0
trace_tad/models/projections/actionformer_proj.py +186 -0
trace_tad/models/projections/tridet_proj.py +140 -0
trace_tad/models/utils/__init__.py +1 -0
trace_tad/models/utils/bbox_tools.py +58 -0
trace_tad/models/utils/iou_tools.py +150 -0
trace_tad/models/utils/misc.py +25 -0
trace_tad/models/utils/post_processing/__init__.py +9 -0
trace_tad/models/utils/post_processing/classifier.py +187 -0
trace_tad/models/utils/post_processing/nms/__init__.py +0 -0
trace_tad/models/utils/post_processing/nms/nms.py +236 -0
trace_tad/models/utils/post_processing/utils.py +160 -0
trace_tad/pipeline.py +375 -0
trace_tad/pipeline_plan.py +488 -0
trace_tad/registry.py +27 -0
trace_tad/server/__init__.py +1 -0
trace_tad/server/app.py +1347 -0
trace_tad/server/jobs_router.py +366 -0
trace_tad/static/annotator/assets/classnames.f9d2a9c9.js +6 -0
trace_tad/static/annotator/assets/index.480a38ed.css +1 -0
trace_tad/static/annotator/assets/index.de688db8.js +1 -0
trace_tad/static/annotator/assets/lodash.5a06a1a1.js +9 -0
trace_tad/static/annotator/assets/moment.40bc58bf.js +8 -0
trace_tad/static/annotator/assets/runtime-dom.4eada9c7.js +21 -0
trace_tad/static/annotator/assets/runtime.a4816b2b.js +1 -0
trace_tad/static/annotator/assets/ui.7b72c5dc.js +8 -0
trace_tad/static/annotator/index.html +32 -0
trace_tad/static/annotator/trace-logo.svg +16 -0
trace_tad/training_resources.py +507 -0
trace_tad/utils/__init__.py +21 -0
trace_tad/utils/auto_tune.py +248 -0
trace_tad/utils/checkpoint.py +37 -0
trace_tad/utils/ema.py +27 -0
trace_tad/utils/logger.py +24 -0
trace_tad/utils/misc.py +67 -0
trace_tad/utils/train_tune.py +226 -0
trace_tad/version.py +1 -0
trace_tad/video_annotation.py +622 -0
trace_tad/weights.py +143 -0
trace_tad-0.2.0.dist-info/METADATA +142 -0
trace_tad-0.2.0.dist-info/RECORD +116 -0
trace_tad-0.2.0.dist-info/WHEEL +5 -0
trace_tad-0.2.0.dist-info/entry_points.txt +2 -0
trace_tad-0.2.0.dist-info/licenses/LICENSE +176 -0
trace_tad-0.2.0.dist-info/top_level.txt +3 -0

configs/__init__.py ADDED Viewed

File without changes

configs/_dataset.py ADDED Viewed

@@ -0,0 +1,98 @@
+annotation_path = "dataset.json"
+class_map = "classmap.txt"
+data_path = "."
+block_list = None
+window_size = 256
+dataset = dict(
+    train=dict(
+        type="ThumosPaddingDataset",
+        ann_file=annotation_path,
+        subset_name="training",
+        block_list=block_list,
+        class_map=class_map,
+        data_path=data_path,
+        filter_gt=False,
+        feature_stride=1,
+        sample_stride=1,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4),
+            dict(
+                type="LoadFrames",
+                num_clips=1,
+                method="random_trunc",
+                trunc_len=window_size,
+                trunc_thresh=0.5,
+                crop_ratio=[0.9, 1.0],
+            ),
+            dict(type="VideoDecode"),
+            dict(type="VideoResize", scale=(-1, 256)),
+            dict(type="VideoRandomResizedCrop"),
+            dict(type="VideoResize", scale=(224, 224)),
+            dict(type="VideoFlip", flip_ratio=0.5),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    val=dict(
+        type="ThumosSlidingDataset",
+        ann_file=annotation_path,
+        subset_name="validation",
+        block_list=block_list,
+        class_map=class_map,
+        data_path=data_path,
+        filter_gt=False,
+        feature_stride=1,
+        sample_stride=1,
+        window_size=window_size,
+        window_overlap_ratio=0.25,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4),
+            dict(type="LoadFrames", num_clips=1, method="sliding_window"),
+            dict(type="VideoDecode"),
+            dict(type="VideoResize", scale=(-1, 224)),
+            dict(type="VideoCenterCrop", crop_size=224),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    test=dict(
+        type="ThumosSlidingDataset",
+        ann_file=annotation_path,
+        subset_name="validation",
+        block_list=block_list,
+        class_map=class_map,
+        data_path=data_path,
+        filter_gt=False,
+        test_mode=True,
+        feature_stride=1,
+        sample_stride=1,
+        window_size=window_size,
+        window_overlap_ratio=0.5,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4),
+            dict(type="LoadFrames", num_clips=1, method="sliding_window"),
+            dict(type="VideoDecode"),
+            dict(type="VideoResize", scale=(-1, 224)),
+            dict(type="VideoCenterCrop", crop_size=224),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs"]),
+            dict(type="Collect", inputs="imgs", keys=["masks"]),
+        ],
+    ),
+)
+evaluation = dict(
+    type="Precision",
+    subset="validation",
+    tiou_thresholds=[0.3, 0.4, 0.5, 0.6, 0.7],
+    ground_truth_filename=annotation_path,
+    gt_fps=30.0,
+    eval_fps=30.0,
+)

configs/_model.py ADDED Viewed

@@ -0,0 +1,52 @@
+model = dict(
+    type="TriDet",
+    projection=dict(
+        type="TriDetProj",
+        in_channels=2048,
+        out_channels=512,
+        sgp_mlp_dim=768,
+        arch=(2, 2, 5),  # layers in embed / stem / branch
+        downsample_type="max",
+        sgp_win_size=[1, 1, 1, 1, 1, 1],
+        k=5,
+        init_conv_vars=0,
+        conv_cfg=dict(kernel_size=3),
+        norm_cfg=dict(type="LN"),
+        path_pdrop=0.1,
+        use_abs_pe=True,
+        max_seq_len=768,
+        input_noise=0.0,
+    ),
+    neck=dict(
+        type="FPNIdentity",
+        in_channels=512,
+        out_channels=512,
+        num_levels=6,
+    ),
+    rpn_head=dict(
+        type="TriDetHead",
+        num_classes=3,
+        in_channels=512,
+        feat_channels=512,
+        num_convs=2,
+        cls_prior_prob=0.01,
+        prior_generator=dict(
+            type="PointGenerator",
+            strides=[1, 2, 4, 8, 16, 32],
+            regression_range=[(0, 4), (4, 8), (8, 16), (16, 32), (32, 64), (64, 10000)],
+        ),
+        loss_normalizer=100,
+        loss_normalizer_momentum=0.9,
+        center_sample="radius",
+        center_sample_radius=1.5,
+        label_smoothing=0.0,
+        boundary_kernel_size=3,
+        iou_weight_power=0.2,
+        num_bins=16,
+        loss=dict(
+            cls_loss=dict(type="ClassBalancedFocalLoss", beta=0.999),
+            reg_loss=dict(type="DIOULoss"),
+            iou_rate=dict(type="GIOULoss"),
+        ),
+    ),
+)

configs/large.py ADDED Viewed

@@ -0,0 +1,149 @@
+_base_ = [
+    "_dataset.py",
+    "_model.py",
+]
+window_size = 768
+scale_factor = 1
+chunk_num = window_size * scale_factor // 16
+dataset = dict(
+    train=dict(
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(
+                type="LoadFrames",
+                num_clips=1,
+                method="random_trunc",
+                trunc_len=window_size,
+                trunc_thresh=0.75,
+                crop_ratio=[0.9, 1.0],
+                scale_factor=scale_factor,
+            ),
+            dict(type="VideoTemporalAugment", speed_range=[0.8, 1.2], p=0.5),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFlip", flip_ratio=0.5),
+            dict(type="VideoColorJitter", brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    val=dict(
+        window_size=window_size,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(type="LoadFrames", num_clips=1, method="random_trunc", scale_factor=scale_factor),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    test=dict(
+        window_size=window_size,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(type="LoadFrames", num_clips=1, method="sliding_window", scale_factor=scale_factor),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs"]),
+            dict(type="Collect", inputs="imgs", keys=["masks"]),
+        ],
+    ),
+)
+model = dict(
+    backbone=dict(
+        type="VisionTransformerAdapter",
+        img_size=224,
+        patch_size=16,
+        embed_dims=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        drop_path_rate=0.3,
+        norm_cfg=dict(type="LN", eps=1e-6),
+        return_feat_map=True,
+        with_cp=True,
+        total_frames=window_size * scale_factor,
+        adapter_index=list(range(24)),
+        custom=dict(
+            pretrain="pretrained/vit-large-p16_videomaev2-k400.pth",
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            pre_processing_pipeline=[
+                dict(type="Rearrange", keys=["frames"], ops="b n c (t1 t) h w -> (b t1) n c t h w", t1=chunk_num),
+            ],
+            post_processing_pipeline=[
+                dict(type="Reduce", keys=["feats"], ops="b n c t h w -> b c t", reduction="mean"),
+                dict(type="Rearrange", keys=["feats"], ops="(b t1) c t -> b c (t1 t)", t1=chunk_num),
+                dict(type="Interpolate", keys=["feats"], size=window_size),
+            ],
+            norm_eval=False,
+            freeze_backbone=False,
+        ),
+    ),
+    projection=dict(in_channels=1024, input_noise=0.0005),
+)
+solver = dict(
+    train=dict(batch_size=1, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    val=dict(batch_size=4, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    test=dict(batch_size=4, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    clip_grad_norm=1,
+    ema=True,
+    amp=True,
+    accumulation_steps=2,
+    compile=False,
+)
+optimizer = dict(
+    type="AdamW",
+    lr=7e-5,
+    weight_decay=0.025,
+    paramwise=True,
+    backbone=dict(
+        lr=0,
+        weight_decay=0,
+        custom=[dict(name="adapter", lr=1e-4, weight_decay=0.05)],
+        exclude=["backbone"],
+    ),
+)
+scheduler = dict(type="LinearWarmupCosineAnnealingLR", warmup_epoch=5, max_epoch=150)
+inference = dict(load_from_raw_predictions=False, save_raw_prediction=False)
+post_processing = dict(
+    nms=dict(
+        use_soft_nms=True,
+        sigma=0.5,
+        max_seg_num=2000,
+        # min_score is a compaction threshold, not an output filter. Soft-NMS
+        # drops items whose decayed score falls below it — shrinking the active
+        # set and avoiding O(N²) work on items that will never reach the top
+        # `max_seg_num` anyway. Long videos aggregate 100k+ proposals across
+        # overlapping sliding windows; 0.05 is well below typical output cutoffs
+        # (which sit around 0.25+) so outputs remain bit-identical to 0.001
+        # while per-video NMS runs ~2× faster.
+        min_score=0.05,
+        multiclass=True,
+        voting_thresh=0.7,
+    ),
+    save_dict=True,
+)
+workflow = dict(
+    logging_interval=50,
+    checkpoint_interval=5,
+    val_eval_interval=5,
+    val_start_epoch=5,
+)
+work_dir = "exps/large"

configs/small.py ADDED Viewed

@@ -0,0 +1,146 @@
+_base_ = [
+    "_dataset.py",
+    "_model.py",
+]
+window_size = 768
+scale_factor = 1
+chunk_num = window_size * scale_factor // 16
+dataset = dict(
+    train=dict(
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(
+                type="LoadFrames",
+                num_clips=1,
+                method="random_trunc",
+                trunc_len=window_size,
+                trunc_thresh=0.75,
+                crop_ratio=[0.9, 1.0],
+                scale_factor=scale_factor,
+            ),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFlip", flip_ratio=0.5),
+            dict(type="VideoColorJitter", brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    val=dict(
+        window_size=window_size,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(type="LoadFrames", num_clips=1, method="random_trunc", scale_factor=scale_factor),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs", "gt_segments", "gt_labels"]),
+            dict(type="Collect", inputs="imgs", keys=["masks", "gt_segments", "gt_labels"]),
+        ],
+    ),
+    test=dict(
+        window_size=window_size,
+        pipeline=[
+            dict(type="PrepareVideoInfo", format="mp4"),
+            dict(type="VideoInit", num_threads=4, resize=(144, 144)),
+            dict(type="LoadFrames", num_clips=1, method="sliding_window", scale_factor=scale_factor),
+            dict(type="VideoDecode"),
+            dict(type="VideoBatchResize", scale=(144, 144)),
+            dict(type="VideoFormatShape", input_format="NCTHW"),
+            dict(type="ConvertToTensor", keys=["imgs"]),
+            dict(type="Collect", inputs="imgs", keys=["masks"]),
+        ],
+    ),
+)
+model = dict(
+    backbone=dict(
+        type="VisionTransformerAdapter",
+        img_size=224,
+        patch_size=16,
+        embed_dims=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        qkv_bias=True,
+        drop_path_rate=0.1,
+        norm_cfg=dict(type="LN", eps=1e-6),
+        return_feat_map=True,
+        with_cp=True,
+        total_frames=window_size * scale_factor,
+        adapter_index=list(range(12)),
+        custom=dict(
+            pretrain="pretrained/vit-small-p16_videomae-k400-pre_16x4x1_kinetics-400_my.pth",
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            pre_processing_pipeline=[
+                dict(type="Rearrange", keys=["frames"], ops="b n c (t1 t) h w -> (b t1) n c t h w", t1=chunk_num),
+            ],
+            post_processing_pipeline=[
+                dict(type="Reduce", keys=["feats"], ops="b n c t h w -> b c t", reduction="mean"),
+                dict(type="Rearrange", keys=["feats"], ops="(b t1) c t -> b c (t1 t)", t1=chunk_num),
+                dict(type="Interpolate", keys=["feats"], size=window_size),
+            ],
+            norm_eval=False,
+            freeze_backbone=False,
+        ),
+    ),
+    projection=dict(in_channels=384, input_noise=0.0005),
+)
+solver = dict(
+    train=dict(batch_size=1, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    val=dict(batch_size=4, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    test=dict(batch_size=4, num_workers=16, persistent_workers=True, prefetch_factor=4),
+    clip_grad_norm=1,
+    ema=True,
+    amp=True,
+)
+optimizer = dict(
+    type="AdamW",
+    lr=7e-5,
+    weight_decay=0.025,
+    paramwise=True,
+    backbone=dict(
+        lr=0,
+        weight_decay=0,
+        custom=[dict(name="adapter", lr=1e-4, weight_decay=0.05)],
+        exclude=["backbone"],
+    ),
+)
+scheduler = dict(type="LinearWarmupCosineAnnealingLR", warmup_epoch=5, max_epoch=100)
+inference = dict(load_from_raw_predictions=False, save_raw_prediction=False)
+post_processing = dict(
+    nms=dict(
+        use_soft_nms=True,
+        sigma=0.5,
+        max_seg_num=2000,
+        # min_score is a compaction threshold, not an output filter. Soft-NMS
+        # drops items whose decayed score falls below it — shrinking the active
+        # set and avoiding O(N²) work on items that will never reach the top
+        # `max_seg_num` anyway. Long videos aggregate 100k+ proposals across
+        # overlapping sliding windows; 0.05 is well below typical output cutoffs
+        # (which sit around 0.25+) so outputs remain bit-identical to 0.001
+        # while per-video NMS runs ~2× faster.
+        min_score=0.05,
+        multiclass=True,
+        voting_thresh=0.7,
+    ),
+    save_dict=True,
+)
+workflow = dict(
+    logging_interval=50,
+    checkpoint_interval=2,
+    val_eval_interval=2,
+    val_start_epoch=0,
+)
+work_dir = "exps/small"

tools/__init__.py ADDED Viewed

File without changes