PyPI - d4rt - Versions diffs - 0.0.2__tar.gz → 0.0.3__tar.gz - Mend

d4rt 0.0.2 (tar.gz) → 0.0.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

{d4rt-0.0.2 → d4rt-0.0.3}/PKG-INFO +41 -1
{d4rt-0.0.2 → d4rt-0.0.3}/README.md +40 -0
{d4rt-0.0.2 → d4rt-0.0.3}/d4rt/d4rt.py +29 -7
{d4rt-0.0.2 → d4rt-0.0.3}/pyproject.toml +1 -1
{d4rt-0.0.2 → d4rt-0.0.3}/.gitignore +0 -0
{d4rt-0.0.2 → d4rt-0.0.3}/LICENSE +0 -0
{d4rt-0.0.2 → d4rt-0.0.3}/d4rt/__init__.py +0 -0

{d4rt-0.0.2 → d4rt-0.0.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: d4rt
-Version: 0.0.2
+Version: 0.0.3
 Summary: Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes
 Project-URL: Homepage, https://pypi.org/project/d4rt/
 Project-URL: Repository, https://codeberg.org/lucidrains/d4rt
@@ -50,6 +50,46 @@ Description-Content-Type: text/markdown
 Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
+## install
+```shell
+$ pip install d4rt
+```
+## usage
+```python
+import torch
+from d4rt import D4RT
+model = D4RT(
+    dim = 512,
+    video_image_size = 128,
+    video_patch_size = 32,
+    video_max_time_len = 10,
+    enc_depth = 6,
+    dec_depth = 6
+)
+videos = torch.randn(2, 10, 3, 128, 128)
+points = torch.randn(2, 5, 3)
+queries = torch.randn(2, 5, 512)
+loss = model(
+    videos,
+    coors = torch.randint(0, 128, (2, 5, 2)),
+    time_src = torch.randint(0, 10, (2, 5)),
+    time_tgt = torch.randint(0, 10, (2, 5)),
+    time_camera = torch.randint(0, 10, (2, 5)),
+    points = points
+)
+loss.backward()
+pred = model(videos, queries = queries) # (2, 5, 3)
+assert pred.shape == (2, 5, 3)
+```
 ## citations
 ```bibtex

{d4rt-0.0.2 → d4rt-0.0.3}/README.md RENAMED Viewed

@@ -4,6 +4,46 @@
 Implementation of [D4RT](https://d4rt-paper.github.io/), Efficiently Reconstructing Dynamic Scenes, Deepmind
+## install
+```shell
+$ pip install d4rt
+```
+## usage
+```python
+import torch
+from d4rt import D4RT
+model = D4RT(
+    dim = 512,
+    video_image_size = 128,
+    video_patch_size = 32,
+    video_max_time_len = 10,
+    enc_depth = 6,
+    dec_depth = 6
+)
+videos = torch.randn(2, 10, 3, 128, 128)
+points = torch.randn(2, 5, 3)
+queries = torch.randn(2, 5, 512)
+loss = model(
+    videos,
+    coors = torch.randint(0, 128, (2, 5, 2)),
+    time_src = torch.randint(0, 10, (2, 5)),
+    time_tgt = torch.randint(0, 10, (2, 5)),
+    time_camera = torch.randint(0, 10, (2, 5)),
+    points = points
+)
+loss.backward()
+pred = model(videos, queries = queries) # (2, 5, 3)
+assert pred.shape == (2, 5, 3)
+```
 ## citations
 ```bibtex

{d4rt-0.0.2 → d4rt-0.0.3}/d4rt/d4rt.py RENAMED Viewed

@@ -10,9 +10,9 @@ from x_transformers import Encoder, CrossAttender, Attention, FeedForward
 # ein notation
 import einx
-from einops import rearrange
+from einops import rearrange, repeat
 from einops.layers.torch import Rearrange
-from torch_einops_utils import pack_with_inverse
+from torch_einops_utils import pack_with_inverse, lens_to_mask, maybe
 # helpers
@@ -24,7 +24,12 @@ def divisible_by(num, den):
 # function for the patch embedding in the query
-def extract_patches(video, coors, time_src, patch_size):
+def extract_patches(
+    video,      # float[b t c h w]
+    coors,      # int[b q 2]
+    time_src,   # int[b q]
+    patch_size
+):
     b, q, p, device = *time_src.shape, patch_size, video.device
     padded_video = F.pad(video, (p,) * 4)
@@ -112,11 +117,15 @@ class VideoEncoder(Module):
     def forward(
         self,
-        video # float[b t c h w]
+        video,      # float[b t c h w],
+        mask = None # bool[b t]
     ): # float[b n d]
         tokens = self.patch_to_tokens(video) # float[b t s d]
+        if exists(mask):
+            mask = repeat(mask, 'b ... -> (b s) ...', s = tokens.shape[-2])
         for spatial_attn, time_attn, ff in self.layers:
             # space attn
@@ -133,7 +142,7 @@ class VideoEncoder(Module):
             tokens, inverse_pack = pack_with_inverse(tokens, '* t d')
-            tokens = time_attn(tokens) + tokens
+            tokens = time_attn(tokens,  mask = mask) + tokens
             tokens = inverse_pack(tokens)
@@ -223,8 +232,12 @@ class D4RT(Module):
         time_camera = None, # int[b q]
         queries = None,     # float[b q d]
         points = None,      # float[b q 3]
-        return_pred = False
+        return_pred = False,
+        video_lens = None   # int[b]
     ):
+        # embedding to queries
         assert (
             exists(queries) or
             all([exists(p) for p in (coors, time_src, time_tgt, time_camera)])
@@ -245,12 +258,21 @@ class D4RT(Module):
             queries = self.norm_queries(queries)
-        global_spatial_repr = self.to_global_spatial_repr(video)
+        # self attention
+        time = video.shape[1]
+        video_mask = maybe(lens_to_mask)(video_lens, time)
+        global_spatial_repr = self.to_global_spatial_repr(video, mask = video_mask)
         global_spatial_repr, inverse_pack_spacetime = pack_with_inverse(global_spatial_repr, 'b * d')
+        # cross attention
         queried = self.cross_attender(queries, context = global_spatial_repr)
+        # prediction
         pred = self.to_pred(queried)
         if not exists(points):

{d4rt-0.0.2 → d4rt-0.0.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "d4rt"
-version = "0.0.2"
+version = "0.0.3"
 description = "Implementation of D4RT, Efficiently Reconstructing Dynamic Scenes"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

{d4rt-0.0.2 → d4rt-0.0.3}/.gitignore RENAMED Viewed

File without changes

{d4rt-0.0.2 → d4rt-0.0.3}/LICENSE RENAMED Viewed

File without changes

{d4rt-0.0.2 → d4rt-0.0.3}/d4rt/__init__.py RENAMED Viewed

File without changes

d4rt 0.0.2__tar.gz → 0.0.3__tar.gz

d4rt 0.0.2 (tar.gz) → 0.0.3 (tar.gz)