yomitoku 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yomitoku/models/layers/activate.py +13 -0
- yomitoku/models/layers/rtdetr_backbone.py +28 -6
- yomitoku/models/layers/rtdetr_hybrid_encoder.py +31 -7
- yomitoku/models/layers/rtdetrv2_decoder.py +56 -18
- yomitoku/postprocessor/rtdetr_postprocessor.py +27 -5
- {yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/METADATA +1 -1
- {yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/RECORD +9 -9
- {yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/WHEEL +1 -1
- {yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/entry_points.txt +0 -0
 
yomitoku/models/layers/activate.py

@@ -1,3 +1,16 @@
+# Copyright(c) 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import torch.nn as nn
 
 
yomitoku/models/layers/rtdetr_backbone.py

@@ -1,4 +1,16 @@
-
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from collections import OrderedDict
 

@@ -47,7 +59,9 @@ class ConvNormLayer(nn.Module):
 class BasicBlock(nn.Module):
     expansion = 1
 
-    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
+    def __init__(
+        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
+    ):
         super().__init__()
 
         self.shortcut = shortcut

@@ -86,7 +100,9 @@ class BasicBlock(nn.Module):
 class BottleNeck(nn.Module):
     expansion = 4
 
-    def __init__(self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"):
+    def __init__(
+        self, ch_in, ch_out, stride, shortcut, act="relu", variant="b"
+    ):
         super().__init__()
 
         if variant == "a":

@@ -109,13 +125,17 @@ class BottleNeck(nn.Module):
                         ("pool", nn.AvgPool2d(2, 2, 0, ceil_mode=True)),
                         (
                             "conv",
-                            ConvNormLayer(ch_in, ch_out * self.expansion, 1, 1),
+                            ConvNormLayer(
+                                ch_in, ch_out * self.expansion, 1, 1
+                            ),
                         ),
                     ]
                 )
             )
         else:
-            self.short = ConvNormLayer(ch_in, ch_out * self.expansion, 1, stride)
+            self.short = ConvNormLayer(
+                ch_in, ch_out * self.expansion, 1, stride
+            )
 
         self.act = nn.Identity() if act is None else get_activation(act)
 

@@ -136,7 +156,9 @@ class BottleNeck(nn.Module):
 
 
 class Blocks(nn.Module):
-    def __init__(self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"):
+    def __init__(
+        self, block, ch_in, ch_out, count, stage_num, act="relu", variant="b"
+    ):
         super().__init__()
 
         self.blocks = nn.ModuleList()
yomitoku/models/layers/rtdetr_hybrid_encoder.py

@@ -1,4 +1,16 @@
-
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 from collections import OrderedDict

@@ -240,7 +252,9 @@ class HybridEncoder(nn.Module):
         for in_channel in in_channels:
             if version == "v1":
                 proj = nn.Sequential(
-                    nn.Conv2d(in_channel, hidden_dim, kernel_size=1, bias=False),
+                    nn.Conv2d(
+                        in_channel, hidden_dim, kernel_size=1, bias=False
+                    ),
                     nn.BatchNorm2d(hidden_dim),
                 )
             elif version == "v2":

@@ -276,7 +290,9 @@ class HybridEncoder(nn.Module):
 
         self.encoder = nn.ModuleList(
             [
-                TransformerEncoder(copy.deepcopy(encoder_layer), num_encoder_layers)
+                TransformerEncoder(
+                    copy.deepcopy(encoder_layer), num_encoder_layers
+                )
                 for _ in range(len(use_encoder_idx))
             ]
         )

@@ -331,7 +347,9 @@ class HybridEncoder(nn.Module):
             # self.register_buffer(f'pos_embed{idx}', pos_embed)
 
     @staticmethod
-    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
+    def build_2d_sincos_position_embedding(
+        w, h, embed_dim=256, temperature=10000.0
+    ):
         """ """
         grid_w = torch.arange(int(w), dtype=torch.float32)
         grid_h = torch.arange(int(h), dtype=torch.float32)

@@ -369,7 +387,9 @@ class HybridEncoder(nn.Module):
                     src_flatten.device
                 )
 
-                memory: torch.Tensor = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                memory: torch.Tensor = self.encoder[i](
+                    src_flatten, pos_embed=pos_embed
+                )
                 proj_feats[enc_ind] = (
                     memory.permute(0, 2, 1)
                     .reshape(-1, self.hidden_dim, h, w)

@@ -381,9 +401,13 @@ class HybridEncoder(nn.Module):
         for idx in range(len(self.in_channels) - 1, 0, -1):
             feat_heigh = inner_outs[0]
             feat_low = proj_feats[idx - 1]
-            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](feat_heigh)
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                feat_heigh
+            )
             inner_outs[0] = feat_heigh
-            upsample_feat = F.interpolate(feat_heigh, scale_factor=2.0, mode="nearest")
+            upsample_feat = F.interpolate(
+                feat_heigh, scale_factor=2.0, mode="nearest"
+            )
             inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
                 torch.concat([upsample_feat, feat_low], dim=1)
            )
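The @@ -331 hunk above only re-wraps the signature of `build_2d_sincos_position_embedding`; its body is unchanged. For reference, a self-contained sketch of the standard 2D sin-cos embedding that signature describes (this is the common RT-DETR-style formulation, not code copied from the wheel):

```python
import torch


def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
    # Standard 2D sine-cosine position embedding, returned as [1, w*h, embed_dim].
    grid_w = torch.arange(int(w), dtype=torch.float32)
    grid_h = torch.arange(int(h), dtype=torch.float32)
    grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
    pos_dim = embed_dim // 4
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1.0 / (temperature**omega)
    out_w = grid_w.flatten()[..., None] @ omega[None]  # [w*h, pos_dim]
    out_h = grid_h.flatten()[..., None] @ omega[None]
    return torch.concat(
        [out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1
    )[None, :, :]


pos_embed = build_2d_sincos_position_embedding(20, 20, embed_dim=256)
print(pos_embed.shape)  # torch.Size([1, 400, 256])
```

In encoders of this kind the embedding is typically added to the flattened feature tokens before self-attention.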
yomitoku/models/layers/rtdetrv2_decoder.py

@@ -1,4 +1,17 @@
-
+# Scene Text Recognition Model Hub
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import copy
 import functools

@@ -27,7 +40,9 @@ def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
 
 
 class MLP(nn.Module):
-    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"):
+    def __init__(
+        self, input_dim, hidden_dim, output_dim, num_layers, act="relu"
+    ):
         super().__init__()
         self.num_layers = num_layers
         h = [hidden_dim] * (num_layers - 1)

@@ -178,7 +193,9 @@ class MSDeformableAttention(nn.Module):
         elif reference_points.shape[-1] == 4:
             # reference_points [8, 480, None, 1,  4]
             # sampling_offsets [8, 480, 8,    12, 2]
-            num_points_scale = self.num_points_scale.to(dtype=query.dtype).unsqueeze(-1)
+            num_points_scale = self.num_points_scale.to(
+                dtype=query.dtype
+            ).unsqueeze(-1)
             offset = (
                 sampling_offsets
                 * num_points_scale

@@ -313,7 +330,9 @@ def deformable_attention_core_func_v2(
     _, Len_q, _, _, _ = sampling_locations.shape
 
     split_shape = [h * w for h, w in value_spatial_shapes]
-    value_list = value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
+    value_list = (
+        value.permute(0, 2, 3, 1).flatten(0, 1).split(split_shape, dim=-1)
+    )
 
     # sampling_offsets [8, 480, 8, 12, 2]
     if method == "default":

@@ -342,7 +361,8 @@ def deformable_attention_core_func_v2(
         elif method == "discrete":
             # n * m, seq, n, 2
             sampling_coord = (
-                sampling_grid_l * torch.tensor([[w, h]], device=value.device) + 0.5
+                sampling_grid_l * torch.tensor([[w, h]], device=value.device)
+                + 0.5
             ).to(torch.int64)
 
             # FIX ME? for rectangle input

@@ -369,7 +389,9 @@ def deformable_attention_core_func_v2(
     attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(
         bs * n_head, 1, Len_q, sum(num_points_list)
     )
-    weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights
+    weighted_sample_locs = (
+        torch.concat(sampling_value_list, dim=-1) * attn_weights
+    )
     output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, Len_q)
 
     return output.permute(0, 2, 1)

@@ -584,7 +606,9 @@ class RTDETRTransformerv2(nn.Module):
                     [
                         (
                             "conv",
-                            nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False),
+                            nn.Conv2d(
+                                in_channels, self.hidden_dim, 1, bias=False
+                            ),
                         ),
                         (
                             "norm",

@@ -665,9 +689,13 @@ class RTDETRTransformerv2(nn.Module):
                 torch.arange(h), torch.arange(w), indexing="ij"
             )
             grid_xy = torch.stack([grid_x, grid_y], dim=-1)
-            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor(
+                [w, h], dtype=dtype
+            )
             wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
-            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
+            lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(
+                -1, h * w, 4
+            )
             anchors.append(lvl_anchors)
 
         anchors = torch.concat(anchors, dim=1).to(device)

@@ -701,18 +729,22 @@ class RTDETRTransformerv2(nn.Module):
         )
 
         enc_topk_bboxes_list, enc_topk_logits_list = [], []
-        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = self._select_topk(
-            output_memory,
-            enc_outputs_logits,
-            enc_outputs_coord_unact,
-            self.num_queries,
+        enc_topk_memory, enc_topk_logits, enc_topk_bbox_unact = (
+            self._select_topk(
+                output_memory,
+                enc_outputs_logits,
+                enc_outputs_coord_unact,
+                self.num_queries,
+            )
         )
 
         # if self.num_select_queries != self.num_queries:
         #     raise NotImplementedError('')
 
         if self.learn_query_content:
-            content = self.tgt_embed.weight.unsqueeze(0).tile([memory.shape[0], 1, 1])
+            content = self.tgt_embed.weight.unsqueeze(0).tile(
+                [memory.shape[0], 1, 1]
+            )
         else:
             content = enc_topk_memory.detach()
 

@@ -739,7 +771,9 @@ class RTDETRTransformerv2(nn.Module):
         topk: int,
     ):
         if self.query_select_method == "default":
-            _, topk_ind = torch.topk(outputs_logits.max(-1).values, topk, dim=-1)
+            _, topk_ind = torch.topk(
+                outputs_logits.max(-1).values, topk, dim=-1
+            )
 
         elif self.query_select_method == "one2many":
             _, topk_ind = torch.topk(outputs_logits.flatten(1), topk, dim=-1)

@@ -752,12 +786,16 @@ class RTDETRTransformerv2(nn.Module):
 
         topk_coords = outputs_coords_unact.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_coords_unact.shape[-1]),
+            index=topk_ind.unsqueeze(-1).repeat(
+                1, 1, outputs_coords_unact.shape[-1]
+            ),
         )
 
         topk_logits = outputs_logits.gather(
             dim=1,
-            index=topk_ind.unsqueeze(-1).repeat(1, 1, outputs_logits.shape[-1]),
+            index=topk_ind.unsqueeze(-1).repeat(
+                1, 1, outputs_logits.shape[-1]
+            ),
        )
 
         topk_memory = memory.gather(
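The anchor-construction lines touched by the @@ -665 hunk can be read on their own; a minimal sketch of the same computation for a single feature level, with illustrative values for h, w, lvl, and grid_size (not taken from the package config):

```python
import torch

# Illustrative single-level values; in the decoder these come from the
# feature-map shapes and the model configuration.
h, w, lvl, grid_size, dtype = 20, 20, 0, 0.05, torch.float32

grid_y, grid_x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
grid_xy = torch.stack([grid_x, grid_y], dim=-1)
# normalized cell centers in [0, 1]
grid_xy = (grid_xy.unsqueeze(0) + 0.5) / torch.tensor([w, h], dtype=dtype)
# anchor width/height grows with the pyramid level
wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl)
lvl_anchors = torch.concat([grid_xy, wh], dim=-1).reshape(-1, h * w, 4)
print(lvl_anchors.shape)  # torch.Size([1, 400, 4]), cxcywh per grid cell
```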
yomitoku/postprocessor/rtdetr_postprocessor.py

@@ -1,4 +1,17 @@
-
+# Copyright 2023 lyuwenyu
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 
 import torch
 import torch.nn as nn

@@ -41,12 +54,16 @@ class RTDETRPostProcessor(nn.Module):
         logits, boxes = outputs["pred_logits"], outputs["pred_boxes"]
         # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
 
-        bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
+        bbox_pred = torchvision.ops.box_convert(
+            boxes, in_fmt="cxcywh", out_fmt="xyxy"
+        )
         bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)
 
         if self.use_focal_loss:
             scores = F.sigmoid(logits)
-            scores, index = torch.topk(scores.flatten(1), self.num_top_queries, dim=-1)
+            scores, index = torch.topk(
+                scores.flatten(1), self.num_top_queries, dim=-1
+            )
             # TODO for older tensorrt
             # labels = index % self.num_classes
             labels = mod(index, self.num_classes)

@@ -60,7 +77,9 @@ class RTDETRPostProcessor(nn.Module):
             scores = F.softmax(logits)[:, :, :-1]
             scores, labels = scores.max(dim=-1)
             if scores.shape[1] > self.num_top_queries:
-                scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
+                scores, index = torch.topk(
+                    scores, self.num_top_queries, dim=-1
+                )
                 labels = torch.gather(labels, dim=1, index=index)
                 boxes = torch.gather(
                     boxes,

@@ -78,7 +97,10 @@ class RTDETRPostProcessor(nn.Module):
 
             labels = (
                 torch.tensor(
-                    [mscoco_label2category[int(x.item())] for x in labels.flatten()]
+                    [
+                        mscoco_label2category[int(x.item())]
+                        for x in labels.flatten()
+                    ]
                 )
                 .to(boxes.device)
                 .reshape(labels.shape)
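These postprocessor hunks also change only line wrapping. As a reference for what the wrapped calls compute, a minimal sketch of the same decode path on dummy tensors (shapes are illustrative, and `%` stands in for the package's `mod` helper):

```python
import torch
import torch.nn.functional as F
import torchvision

# Dummy decoder outputs; shapes are illustrative only.
num_queries, num_classes, num_top_queries = 300, 3, 100
logits = torch.randn(1, num_queries, num_classes)
boxes = torch.rand(1, num_queries, 4)            # cxcywh, normalized to [0, 1]
orig_target_sizes = torch.tensor([[640, 480]])   # (w, h) of the original image

# Convert and rescale boxes, then keep the highest-scoring queries.
bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
bbox_pred = bbox_pred * orig_target_sizes.repeat(1, 2).unsqueeze(1)

scores = F.sigmoid(logits)
scores, index = torch.topk(scores.flatten(1), num_top_queries, dim=-1)
labels = index % num_classes                     # class id per selected query
```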
{yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: yomitoku
-Version: 0.5.2
+Version: 0.5.3
 Summary: Yomitoku is an AI-powered document image analysis package designed specifically for the Japanese language.
 Author-email: Kotaro Kinoshita <kotaro.kinoshita@mlism.com>
 License: CC BY-NC-SA 4.0
{yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/RECORD

@@ -29,16 +29,16 @@ yomitoku/models/dbnet_plus.py,sha256=jeWJZm0ihbxoJeAXBFK7uVIwoosx2IUNk7Ut5wRH0vA
 yomitoku/models/parseq.py,sha256=7QT-q5_oWqXTDXobRk1R6Lpap_AxdC4AzkSsOgXjOwM,8611
 yomitoku/models/rtdetr.py,sha256=oJsr8RHz3frslhLfXdVJve47lUsrmqLjfdTrZ41tlQ0,687
 yomitoku/models/layers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-yomitoku/models/layers/activate.py,sha256=
+yomitoku/models/layers/activate.py,sha256=S54GPssZBMloM2oFAXeDVMmBBZOWyjwU98Niq758txE,1244
 yomitoku/models/layers/dbnet_feature_attention.py,sha256=Vpp_PiLVuI7Zs30TTg4RNRn16KTb81ewonADpUHd4aE,6060
 yomitoku/models/layers/parseq_transformer.py,sha256=33eroJf8rmgIptP-NpZLJMhG7XOTwV4rXsq674VrKnU,6704
-yomitoku/models/layers/rtdetr_backbone.py,sha256=
-yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=
-yomitoku/models/layers/rtdetrv2_decoder.py,sha256=
+yomitoku/models/layers/rtdetr_backbone.py,sha256=wn1jOI8oB0V4GWKaCB-WCNMJ7CqmjvkeO8v2GB3gtAQ,10054
+yomitoku/models/layers/rtdetr_hybrid_encoder.py,sha256=fCXNw8DmD5rXAQZkiVy2Ohj3v19TFUTUCohGlj7V408,14321
+yomitoku/models/layers/rtdetrv2_decoder.py,sha256=iAxZ-TknFuEcI1B6-UU8o0rvWnuBr20idqTWCpc-u7A,28456
 yomitoku/postprocessor/__init__.py,sha256=W4vUuqBaFtH5dlSBIYgyaCroGLMjpV6RrNGIBQ8NFVw,243
 yomitoku/postprocessor/dbnet_postporcessor.py,sha256=o_y8b5REd2dFEdIpRcr6o-XBfOCHo9rBYGwokP_uhTc,4948
 yomitoku/postprocessor/parseq_tokenizer.py,sha256=e89_g_bc4Au3SchuxoJfJNATJTxFmVYetzXyAzPWm28,4315
-yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=
+yomitoku/postprocessor/rtdetr_postprocessor.py,sha256=ADT620nCs_OvHmoWwH01ylmbHuCNtVUN8pVYYq-vy0Q,4065
 yomitoku/resource/MPLUS1p-Medium.ttf,sha256=KLL1KkCumIBkgQtx1n4SffdaFuCNffThktEAbkB1OU8,1758908
 yomitoku/resource/charset.txt,sha256=sU91kSi-9Wk4733bCXy4j_UDmvcsj96sHOq1ppUJlOY,21672
 yomitoku/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

@@ -46,7 +46,7 @@ yomitoku/utils/graph.py,sha256=LKNB8ZhSQwOZMfeAimPMF5UCVVr2ZaUWoGDkz8z-uGU,456
 yomitoku/utils/logger.py,sha256=uOmtQDr0A0JD7wyFshedL08BiNrQorHnpktRXba8bjU,424
 yomitoku/utils/misc.py,sha256=2Eyy7-9K_h4Mal1zGXq6OlxubfNzhS0mEYwn_xt7xl8,2497
 yomitoku/utils/visualizer.py,sha256=2pSmbhUPylzVVJ0bXtGDoNmMdArAByab4Py7Xavvs_A,5230
-yomitoku-0.5.2.dist-info/METADATA,sha256=
-yomitoku-0.5.2.dist-info/WHEEL,sha256=
-yomitoku-0.5.2.dist-info/entry_points.txt,sha256=
-yomitoku-0.5.2.dist-info/RECORD,,
+yomitoku-0.5.3.dist-info/METADATA,sha256=qLwgVjKd3AELsZu8k1JGbX2-VHgHq3Tn-eCaw11c0_s,7819
+yomitoku-0.5.3.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+yomitoku-0.5.3.dist-info/entry_points.txt,sha256=nFV3S11zgBNW0Qq_D0XQNg2R4lNXU_9XUFr6rdJoyF8,52
+yomitoku-0.5.3.dist-info/RECORD,,

{yomitoku-0.5.2.dist-info → yomitoku-0.5.3.dist-info}/entry_points.txt

File without changes