flaxdiff 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flaxdiff/models/attention.py +132 -155
- flaxdiff/models/autoencoder/__init__.py +0 -0
- flaxdiff/models/autoencoder/autoencoder.py +14 -0
- flaxdiff/models/autoencoder/diffusers.py +88 -0
- flaxdiff/models/common.py +243 -0
- flaxdiff/models/simple_unet.py +17 -252
- flaxdiff/trainer/__init__.py +28 -45
- flaxdiff/trainer/simple_trainer.py +175 -80
- {flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/METADATA +10 -2
- {flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/RECORD +12 -9
- {flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/WHEEL +0 -0
- {flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/top_level.txt +0 -0
flaxdiff/trainer/__init__.py
CHANGED
@@ -1,32 +1,24 @@
-import orbax.checkpoint
-import tqdm
 from flax import linen as nn
 import jax
 from typing import Callable
 from dataclasses import field
 import jax.numpy as jnp
-from clu import metrics
-from flax.training import train_state # Useful dataclass to keep train state
 import optax
-from
-import
-import
-import orbax
-from flax.training import orbax_utils
+from jax.sharding import Mesh, PartitionSpec as P
+from jax.experimental.shard_map import shard_map
+from typing import Dict, Callable, Sequence, Any, Union, Tuple
 
 from ..schedulers import NoiseScheduler
 from ..predictors import DiffusionPredictionTransform, EpsilonPredictionTransform
 
-from .
+from flaxdiff.utils import RandomMarkovState
+
+from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
 
 class TrainState(SimpleTrainState):
     rngs: jax.random.PRNGKey
     ema_params: dict
 
-    def get_random_key(self):
-        rngs, subkey = jax.random.split(self.rngs)
-        return self.replace(rngs=rngs), subkey
-
     def apply_ema(self, decay: float = 0.999):
         new_ema_params = jax.tree_util.tree_map(
             lambda ema, param: decay * ema + (1 - decay) * param,
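Note: the new `TrainState` drops its own `get_random_key` helper (RNG handling moves to `RandomMarkovState`) and keeps an `ema_params` pytree that `apply_ema` updates leaf-wise. A minimal, self-contained sketch of that EMA update; the parameter names and shapes are illustrative, not taken from flaxdiff:

```python
import jax
import jax.numpy as jnp

def apply_ema(ema_params, params, decay=0.999):
    # new_ema = decay * ema + (1 - decay) * current, applied leaf-wise over the pytree
    return jax.tree_util.tree_map(
        lambda ema, p: decay * ema + (1 - decay) * p, ema_params, params)

# Illustrative parameter pytree (not a real flaxdiff model).
params = {'w': jnp.ones((2, 2)), 'b': jnp.zeros((2,))}
ema_params = jax.tree_util.tree_map(jnp.copy, params)  # EMA starts as a copy of the params
ema_params = apply_ema(ema_params, params)
```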
@@ -63,7 +55,7 @@ class DiffusionTrainer(SimpleTrainer):
         self.model_output_transform = model_output_transform
         self.unconditional_prob = unconditional_prob
 
-    def
+    def generate_states(
         self,
         optimizer: optax.GradientTransformation,
         rngs: jax.random.PRNGKey,
@@ -72,6 +64,7 @@ class DiffusionTrainer(SimpleTrainer):
         model: nn.Module = None,
         param_transforms: Callable = None
     ) -> Tuple[TrainState, TrainState]:
+        print("Generating states for DiffusionTrainer")
         rngs, subkey = jax.random.split(rngs)
 
         if existing_state == None:
@@ -102,7 +95,7 @@ class DiffusionTrainer(SimpleTrainer):
         return state, best_state
 
     def _define_train_step(self, batch_size, null_labels_seq, text_embedder):
-        noise_schedule = self.noise_schedule
+        noise_schedule: NoiseScheduler = self.noise_schedule
         model = self.model
         model_output_transform = self.model_output_transform
         loss_fn = self.loss_fn
@@ -117,16 +110,19 @@ class DiffusionTrainer(SimpleTrainer):
 
         distributed_training = self.distributed_training
 
-
+        # @jax.jit
+        def train_step(train_state: TrainState, rng_state: RandomMarkovState, batch, local_device_index):
             """Train for a single step."""
+            rng_state, subkey = rng_state.get_random_key()
+            subkey = jax.random.fold_in(subkey, local_device_index.reshape())
+            local_rng_state = RandomMarkovState(subkey)
+
             images = batch['image']
             # normalize image
             images = (images - 127.5) / 127.5
 
             output = text_embedder(
                 input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
-            # output = infer(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
-
             label_seq = output.last_hidden_state
 
             # Generate random probabilities to decide how much of this batch will be unconditional
@@ -134,10 +130,11 @@ class DiffusionTrainer(SimpleTrainer):
             label_seq = jnp.concat(
                 [null_labels_seq[:num_unconditional], label_seq[num_unconditional:]], axis=0)
 
-            noise_level,
-
-
+            noise_level, local_rng_state = noise_schedule.generate_timesteps(images.shape[0], local_rng_state)
+
+            local_rng_state, rngs = local_rng_state.get_random_key()
             noise: jax.Array = jax.random.normal(rngs, shape=images.shape)
+
             rates = noise_schedule.get_rates(noise_level)
             noisy_images, c_in, expected_output = model_output_transform.forward_diffusion(
                 images, noise, rates)
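Note: inside the train step, the scheduler samples per-image timesteps, Gaussian noise is drawn from the per-device RNG, and `model_output_transform.forward_diffusion` builds the noisy inputs and training target. A generic DDPM-style sketch of that forward-diffusion step, assuming a simple signal-rate/noise-rate parameterisation rather than the exact flaxdiff `NoiseScheduler` API:

```python
import jax
import jax.numpy as jnp

def get_rates(t, T=1000):
    # Simple cosine schedule returning per-sample (signal_rate, noise_rate);
    # flaxdiff's NoiseScheduler.get_rates plays this role in the hunk above.
    alpha = jnp.cos(0.5 * jnp.pi * t / T)
    return alpha, jnp.sqrt(1.0 - alpha ** 2)

def forward_diffusion(key, images, T=1000):
    t_key, eps_key = jax.random.split(key)
    t = jax.random.randint(t_key, (images.shape[0],), 0, T)    # timesteps / noise levels
    eps = jax.random.normal(eps_key, images.shape)             # Gaussian noise
    signal_rate, noise_rate = get_rates(t.astype(jnp.float32), T)
    expand = lambda r: r.reshape(-1, 1, 1, 1)                  # broadcast rates over H, W, C
    noisy = expand(signal_rate) * images + expand(noise_rate) * eps
    return noisy, eps                                          # epsilon is the training target here

key, data_key = jax.random.split(jax.random.PRNGKey(0))
x0 = jax.random.uniform(data_key, (4, 32, 32, 3)) * 2.0 - 1.0  # images normalised to [-1, 1]
noisy, target = forward_diffusion(key, x0)
```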
@@ -154,16 +151,17 @@ class DiffusionTrainer(SimpleTrainer):
                 loss = nloss
                 return loss
 
-            loss, grads = jax.value_and_grad(model_loss)(
+            loss, grads = jax.value_and_grad(model_loss)(train_state.params)
             if distributed_training:
-                grads = jax.lax.pmean(grads, "
-
-
-
-
+                grads = jax.lax.pmean(grads, "data")
+                loss = jax.lax.pmean(loss, "data")
+            train_state = train_state.apply_gradients(grads=grads)
+            train_state = train_state.apply_ema(self.ema_decay)
+            return train_state, loss, rng_state
+
         if distributed_training:
-            train_step =
-
+            train_step = shard_map(train_step, mesh=self.mesh, in_specs=(P(), P(), P('data'), P('data')),
+                                   out_specs=(P(), P(), P()))
         train_step = jax.jit(train_step)
 
         return train_step
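Note: the rewritten step replaces the old `pmap`-style replication with `shard_map` over a one-dimensional `'data'` mesh: the batch and per-device indices are sharded along `'data'`, gradients and loss are averaged with `jax.lax.pmean` on that axis, and the mapped function is then `jax.jit`-compiled. A stripped-down sketch of the same pattern; the linear model, learning rate, and batch here are placeholders, not flaxdiff code:

```python
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, PartitionSpec as P
from jax.experimental.shard_map import shard_map

mesh = Mesh(jax.devices(), ('data',))          # 1-D data-parallel mesh, as in the trainer

def loss_fn(params, batch):
    preds = batch['x'] @ params['w']
    return jnp.mean((preds - batch['y']) ** 2)

def train_step(params, batch):
    # Runs per shard: each device sees only its slice of the batch.
    loss, grads = jax.value_and_grad(loss_fn)(params, batch)
    grads = jax.lax.pmean(grads, 'data')       # average gradients across the mesh axis
    loss = jax.lax.pmean(loss, 'data')
    params = jax.tree_util.tree_map(lambda p, g: p - 1e-3 * g, params, grads)
    return params, loss

train_step = shard_map(train_step, mesh=mesh,
                       in_specs=(P(), P('data')),   # params replicated, batch sharded on 'data'
                       out_specs=(P(), P()))        # updated params and loss come back replicated
train_step = jax.jit(train_step)

params = {'w': jnp.zeros((8, 1))}
batch = {'x': jnp.ones((8 * jax.device_count(), 8)),
         'y': jnp.ones((8 * jax.device_count(), 1))}
params, loss = train_step(params, batch)
```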
@@ -184,18 +182,3 @@ class DiffusionTrainer(SimpleTrainer):
         text_embedder = data['model']
         super().fit(data, steps_per_epoch, epochs, {
             "batch_size": local_batch_size, "null_labels_seq": null_labels_full, "text_embedder": text_embedder})
-
-
-                pbar.set_postfix(loss=f'{loss:.4f}')
-                pbar.update(100)
-            end_time = time.time()
-            self.state = state
-            total_time = end_time - start_time
-            avg_time_per_step = total_time / steps_per_epoch
-            avg_loss = epoch_loss / steps_per_epoch
-            if avg_loss < self.best_loss:
-                self.best_loss = avg_loss
-                self.best_state = state
-                self.save(epoch, best=True)
-            print(f"\n\tEpoch {epoch+1} completed. Avg Loss: {avg_loss}, Time: {total_time:.2f}s, Best Loss: {self.best_loss}")
-        return self.state
flaxdiff/trainer/simple_trainer.py
CHANGED
@@ -5,14 +5,60 @@ import jax
 from typing import Callable
 from dataclasses import field
 import jax.numpy as jnp
+import numpy as np
+from functools import partial
 from clu import metrics
 from flax.training import train_state # Useful dataclass to keep train state
 import optax
 from flax import struct # Flax dataclasses
+import flax
 import time
 import os
 import orbax
 from flax.training import orbax_utils
+from jax.sharding import Mesh, PartitionSpec as P
+from jax.experimental import mesh_utils
+from jax.experimental.shard_map import shard_map
+from orbax.checkpoint.utils import fully_replicated_host_local_array_to_global_array
+from termcolor import colored
+from typing import Dict, Callable, Sequence, Any, Union, Tuple
+
+from flaxdiff.utils import RandomMarkovState
+
+PROCESS_COLOR_MAP = {
+    0: "green",
+    1: "yellow",
+    2: "magenta",
+    3: "cyan",
+    4: "white",
+    5: "light_blue",
+    6: "light_red",
+    7: "light_cyan"
+}
+
+def _build_global_shape_and_sharding(
+    local_shape: tuple[int, ...], global_mesh: Mesh
+) -> tuple[tuple[int, ...], jax.sharding.NamedSharding]:
+    sharding = jax.sharding.NamedSharding(global_mesh, P(global_mesh.axis_names))
+    global_shape = (jax.process_count() * local_shape[0],) + local_shape[1:]
+    return global_shape, sharding
+
+
+def form_global_array(path, array: np.ndarray, global_mesh: Mesh) -> jax.Array:
+    """Put local sharded array into local devices"""
+    global_shape, sharding = _build_global_shape_and_sharding(np.shape(array), global_mesh)
+    try:
+        local_device_arrays = np.split(array, len(global_mesh.local_devices), axis=0)
+    except ValueError as array_split_error:
+        raise ValueError(
+            f"Unable to put to devices shape {array.shape} with "
+            f"local device count {len(global_mesh.local_devices)} "
+        ) from array_split_error
+    local_device_buffers = jax.device_put(local_device_arrays, global_mesh.local_devices)
+    return jax.make_array_from_single_device_arrays(global_shape, sharding, local_device_buffers)
+
+def convert_to_global_tree(global_mesh, pytree):
+    return jax.tree_util.tree_map_with_path(partial(form_global_array, global_mesh=global_mesh), pytree)
 
 @struct.dataclass
 class Metrics(metrics.Collection):
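Note: `form_global_array` splits a host-local numpy batch across the local devices and stitches the shards into one global `jax.Array` via `jax.make_array_from_single_device_arrays`; `convert_to_global_tree` maps that over a whole batch pytree. A hypothetical usage sketch, assuming the helper is importable from `flaxdiff.trainer.simple_trainer` and that the host-local batch size is divisible by the number of local devices:

```python
import jax
import numpy as np
from jax.sharding import Mesh

# Helper defined in the hunk above; import path assumed from this package layout.
from flaxdiff.trainer.simple_trainer import convert_to_global_tree

mesh = Mesh(jax.devices(), ('data',))

# Hypothetical host-local batch from a data loader; the leading axis (32 here)
# is split across this host's local devices.
local_batch = {
    'image': np.zeros((32, 64, 64, 3), dtype=np.float32),
    'label': np.zeros((32,), dtype=np.int32),
}

global_batch = convert_to_global_tree(mesh, local_batch)
# Each leaf is now a global jax.Array sharded over 'data'; with multiple processes
# its leading dimension is jax.process_count() * 32.
print(jax.tree_util.tree_map(lambda x: x.shape, global_batch))
```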
@@ -44,41 +90,75 @@ class SimpleTrainer:
         name: str = "Simple",
         load_from_checkpoint: bool = False,
         checkpoint_suffix: str = "",
+        checkpoint_id: str = None,
         loss_fn=optax.l2_loss,
         param_transforms: Callable = None,
         wandb_config: Dict[str, Any] = None,
         distributed_training: bool = None,
+        checkpoint_base_path: str = "./checkpoints",
     ):
         if distributed_training is None or distributed_training is True:
             # Auto-detect if we are running on multiple devices
             distributed_training = jax.device_count() > 1
+            self.mesh = jax.sharding.Mesh(jax.devices(), 'data')
+            # self.sharding = jax.sharding.NamedSharding(self.mesh, jax.sharding.PartitionSpec('data'))
 
         self.distributed_training = distributed_training
         self.model = model
         self.name = name
         self.loss_fn = loss_fn
         self.input_shapes = input_shapes
-
-
+        self.checkpoint_base_path = checkpoint_base_path
+
+
+        if wandb_config is not None and jax.process_index() == 0:
+            import wandb
             run = wandb.init(**wandb_config)
             self.wandb = run
+
+            # define our custom x axis metric
+            self.wandb.define_metric("train/step")
+            self.wandb.define_metric("train/epoch")
+
+            self.wandb.define_metric("train/loss", step_metric="train/step")
+
+            self.wandb.define_metric("train/epoch_time", step_metric="train/epoch")
+            self.wandb.define_metric("train/avg_time_per_step", step_metric="train/epoch")
+            self.wandb.define_metric("train/avg_loss", step_metric="train/epoch")
+            self.wandb.define_metric("train/best_loss", step_metric="train/epoch")
+
+        if checkpoint_id is None:
+            self.checkpoint_id = name.replace(' ', '_').replace('-', '_').lower()
+        else:
+            self.checkpoint_id = checkpoint_id
+
+        # checkpointer = orbax.checkpoint.PyTreeCheckpointer()
+        async_checkpointer = orbax.checkpoint.AsyncCheckpointer(orbax.checkpoint.PyTreeCheckpointHandler(), timeout_secs=60)
 
-        checkpointer = orbax.checkpoint.PyTreeCheckpointer()
         options = orbax.checkpoint.CheckpointManagerOptions(
             max_to_keep=4, create=True)
         self.checkpointer = orbax.checkpoint.CheckpointManager(
-            self.checkpoint_path() + checkpoint_suffix,
+            self.checkpoint_path() + checkpoint_suffix, async_checkpointer, options)
 
         if load_from_checkpoint:
-            latest_epoch, old_state, old_best_state = self.load()
+            latest_epoch, old_state, old_best_state, rngstate = self.load()
         else:
-            latest_epoch, old_state, old_best_state = 0, None, None
+            latest_epoch, old_state, old_best_state, rngstate = 0, None, None, None
 
         self.latest_epoch = latest_epoch
+
+        if rngstate:
+            self.rngstate = RandomMarkovState(**rngstate)
+        else:
+            self.rngstate = RandomMarkovState(rngs)
+
+        self.rngstate, subkey = self.rngstate.get_random_key()
 
         if train_state == None:
-
-
+            state, best_state = self.generate_states(
+                optimizer, subkey, old_state, old_best_state, model, param_transforms
+            )
+            self.init_state(state, best_state)
         else:
             self.state = train_state
             self.best_state = train_state
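Note: the constructor now initialises Weights & Biases only on process 0 and registers `train/step` and `train/epoch` as custom x-axes, so step-level and epoch-level metrics are plotted against the right counters. A minimal standalone sketch of that pattern; the project and run names are placeholders:

```python
import jax
import wandb

if jax.process_index() == 0:                      # log from a single host only
    run = wandb.init(project="flaxdiff-demo", name="trainer-run")  # placeholder names

    # Custom x-axes: step-level and epoch-level metrics use their own counters.
    run.define_metric("train/step")
    run.define_metric("train/epoch")
    run.define_metric("train/loss", step_metric="train/step")
    run.define_metric("train/avg_loss", step_metric="train/epoch")

    run.log({"train/step": 0, "train/loss": 1.23})
    run.log({"train/epoch": 0, "train/avg_loss": 1.23})
```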
@@ -87,7 +167,7 @@ class SimpleTrainer:
     def get_input_ones(self):
         return {k: jnp.ones((1, *v)) for k, v in self.input_shapes.items()}
 
-    def
+    def generate_states(
         self,
         optimizer: optax.GradientTransformation,
         rngs: jax.random.PRNGKey,
@@ -96,17 +176,19 @@ class SimpleTrainer:
         model: nn.Module = None,
         param_transforms: Callable = None
     ) -> Tuple[SimpleTrainState, SimpleTrainState]:
+        print("Generating states for SimpleTrainer")
         rngs, subkey = jax.random.split(rngs)
 
         if existing_state == None:
             input_vars = self.get_input_ones()
             params = model.init(subkey, **input_vars)
+        else:
+            params = existing_state['params']
 
         state = SimpleTrainState.create(
             apply_fn=model.apply,
             params=params,
             tx=optimizer,
-            rngs=rngs,
             metrics=Metrics.empty()
         )
         if existing_best_state is not None:
@@ -119,40 +201,28 @@ class SimpleTrainer:
 
     def init_state(
         self,
-
-
-        existing_state: dict = None,
-        existing_best_state: dict = None,
-        model: nn.Module = None,
-        param_transforms: Callable = None
+        state: SimpleTrainState,
+        best_state: SimpleTrainState,
     ):
-
-        state, best_state = self.__init_fn(
-            optimizer, rngs, existing_state, existing_best_state, model, param_transforms
-        )
         self.best_loss = 1e9
 
-        if self.distributed_training:
-            devices = jax.local_devices()
-            if len(devices) > 1:
-                print("Replicating state across devices ", devices)
-                state = flax.jax_utils.replicate(state, devices)
-                best_state = flax.jax_utils.replicate(best_state, devices)
-            else:
-                print("Not replicating any state, Only single device connected to the process")
-
         self.state = state
         self.best_state = best_state
 
     def get_state(self):
-        return
+        # return fully_replicated_host_local_array_to_global_array()
+        return jax.tree_util.tree_map(lambda x : np.array(x), self.state)
 
     def get_best_state(self):
-        return flax.jax_utils.
+        # return convert_to_global_tree(self.mesh, flax.jax_utils.replicate(self.best_state, jax.local_devices()))
+        return jax.tree_util.tree_map(lambda x : np.array(x), self.best_state)
+
+    def get_rngstate(self):
+        # return convert_to_global_tree(self.mesh, flax.jax_utils.replicate(self.rngstate, jax.local_devices()))
+        return jax.tree_util.tree_map(lambda x : np.array(x), self.rngstate)
 
     def checkpoint_path(self):
-
-        path = os.path.join(os.path.abspath('./checkpoints'), experiment_name)
+        path = os.path.join(self.checkpoint_base_path, self.checkpoint_id)
         if not os.path.exists(path):
             os.makedirs(path)
         return path
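Note: `get_state`, `get_best_state`, and `get_rngstate` now copy every leaf back to host memory with `np.array` before the pytree is handed to the checkpointer. A tiny sketch of that device-to-host conversion; the pytree is a stand-in, not a real `TrainState`:

```python
import jax
import jax.numpy as jnp
import numpy as np

# Stand-in for a TrainState pytree living on device.
state = {'params': {'w': jnp.ones((4, 4))}, 'step': jnp.array(10)}

# Leaf-wise device -> host copy: the result is a plain numpy pytree that the
# checkpointer can serialise without touching sharded jax.Arrays.
host_state = jax.tree_util.tree_map(lambda x: np.array(x), state)
print(jax.tree_util.tree_map(lambda x: type(x).__name__, host_state))
```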
@@ -170,24 +240,27 @@ class SimpleTrainer:
         ckpt = self.checkpointer.restore(epoch)
         state = ckpt['state']
         best_state = ckpt['best_state']
+        rngstate = ckpt['rngs']
         # Convert the state to a TrainState
         self.best_loss = ckpt['best_loss']
         print(
             f"Loaded model from checkpoint at epoch {epoch}", ckpt['best_loss'])
-        return epoch, state, best_state
+        return epoch, state, best_state, rngstate
 
     def save(self, epoch=0):
         print(f"Saving model at epoch {epoch}")
         ckpt = {
             # 'model': self.model,
+            'rngs': self.get_rngstate(),
             'state': self.get_state(),
             'best_state': self.get_best_state(),
-            'best_loss': self.best_loss
+            'best_loss': np.array(self.best_loss),
         }
         try:
             save_args = orbax_utils.save_args_from_target(ckpt)
             self.checkpointer.save(epoch, ckpt, save_kwargs={
                 'save_args': save_args}, force=True)
+            self.checkpointer.wait_until_finished()
             pass
         except Exception as e:
             print("Error saving checkpoint", e)
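Note: checkpointing is now built around Orbax's `AsyncCheckpointer`, and `save` blocks on `wait_until_finished()` so the asynchronous write completes before training continues. A condensed sketch of the same save/restore round trip using the calls shown in this hunk; the directory and checkpoint contents are placeholders:

```python
import numpy as np
import orbax.checkpoint
from flax.training import orbax_utils

ckpt_dir = "/tmp/flaxdiff-ckpt-demo"  # placeholder directory
async_checkpointer = orbax.checkpoint.AsyncCheckpointer(
    orbax.checkpoint.PyTreeCheckpointHandler(), timeout_secs=60)
options = orbax.checkpoint.CheckpointManagerOptions(max_to_keep=4, create=True)
manager = orbax.checkpoint.CheckpointManager(ckpt_dir, async_checkpointer, options)

# Host-side pytree, loosely mirroring the keys saved above.
ckpt = {'state': {'w': np.ones((2, 2))}, 'best_loss': np.array(1e9)}
save_args = orbax_utils.save_args_from_target(ckpt)
manager.save(0, ckpt, save_kwargs={'save_args': save_args}, force=True)
manager.wait_until_finished()  # block until the asynchronous write has landed

restored = manager.restore(0)
print(restored['best_loss'])
```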
@@ -197,7 +270,7 @@ class SimpleTrainer:
         loss_fn = self.loss_fn
         distributed_training = self.distributed_training
 
-        def train_step(
+        def train_step(train_state: SimpleTrainState, rng_state: RandomMarkovState, batch, local_device_indexes):
             """Train for a single step."""
             images = batch['image']
             labels = batch['label']
@@ -208,17 +281,15 @@ class SimpleTrainer:
                 nloss = loss_fn(preds, expected_output)
                 loss = jnp.mean(nloss)
                 return loss
-            loss, grads = jax.value_and_grad(model_loss)(
+            loss, grads = jax.value_and_grad(model_loss)(train_state.params)
             if distributed_training:
-                grads = jax.lax.pmean(grads, "
-
-                return
+                grads = jax.lax.pmean(grads, "data")
+            train_state = train_state.apply_gradients(grads=grads)
+            return train_state, loss, rng_state
 
         if distributed_training:
-            train_step =
-
-            train_step = jax.jit(train_step)
-
+            train_step = shard_map(train_step, mesh=self.mesh, in_specs=(P(), P(), P('data'), P('data')), out_specs=(P(), P('data'), P()))
+            train_step = jax.pmap(train_step)
         return train_step
 
     def _define_compute_metrics(self):
@@ -251,6 +322,7 @@ class SimpleTrainer:
         }
 
     def init_tensorboard(self, batch_size, steps_per_epoch, epochs):
+        from flax.metrics import tensorboard
         summary_writer = tensorboard.SummaryWriter(self.tensorboard_path())
         summary_writer.hparams({
             **self.config(),
@@ -268,56 +340,79 @@ class SimpleTrainer:
         test_ds = None
         train_step = self._define_train_step(**train_step_args)
         compute_metrics = self._define_compute_metrics()
-
-
-
-
-
-
+        train_state = self.state
+        rng_state = self.rngstate
+        global_device_count = jax.device_count()
+        local_device_count = jax.local_device_count()
+        process_index = jax.process_index()
+        if self.distributed_training:
+            global_device_indexes = jnp.arange(global_device_count)
+        else:
+            global_device_indexes = 0
 
-
-            self.latest_epoch += 1
-            current_epoch = self.latest_epoch
-            print(f"\nEpoch {current_epoch}/{epochs}")
-            start_time = time.time()
+        def train_loop(current_epoch, pbar: tqdm.tqdm, train_state, rng_state):
             epoch_loss = 0
-
-
-
-
-
-
-
+            current_step = 0
+            for i in range(steps_per_epoch):
+                batch = next(train_ds)
+                if self.distributed_training and global_device_count > 1:
+                    # Convert the local device batches to a unified global jax.Array
+                    batch = convert_to_global_tree(self.mesh, batch)
+                train_state, loss, rng_state = train_step(train_state, rng_state, batch, global_device_indexes)
+
+                if self.distributed_training:
+                    loss = jax.experimental.multihost_utils.process_allgather(loss)
+                    loss = jnp.mean(loss) # Just to make sure its a scaler value
+
+                epoch_loss += loss
 
-
-                loss = jnp.mean(loss)
-
-                epoch_loss += loss
+                if pbar is not None:
                     if i % 100 == 0:
                         pbar.set_postfix(loss=f'{loss:.4f}')
                         pbar.update(100)
                         current_step = current_epoch*steps_per_epoch + i
-                        summary_writer.scalar(
-                            'Train Loss', loss, step=current_step)
                         if self.wandb is not None:
-                            self.wandb.log({
+                            self.wandb.log({
+                                "train/step" : current_step,
+                                "train/loss": loss,
+                            }, step=current_step)
+            print(colored(f"Epoch done on index {process_index} => {current_epoch} Loss: {epoch_loss/steps_per_epoch}", 'green'))
+            return epoch_loss, current_step, train_state, rng_state
+
+        while self.latest_epoch < epochs:
+            current_epoch = self.latest_epoch
+            self.latest_epoch += 1
+            print(f"\nEpoch {current_epoch}/{epochs}")
+            start_time = time.time()
+            epoch_loss = 0
 
-
+            if process_index == 0:
+                with tqdm.tqdm(total=steps_per_epoch, desc=f'\t\tEpoch {current_epoch}', ncols=100, unit='step') as pbar:
+                    epoch_loss, current_step, train_state, rng_state = train_loop(current_epoch, pbar, train_state, rng_state)
+            else:
+                epoch_loss, current_step, train_state, rng_state = train_loop(current_epoch, None, train_state, rng_state)
+                print(colored(f"Epoch done on process index {process_index}", PROCESS_COLOR_MAP.get(process_index, 'white')))
+
             end_time = time.time()
-            self.state =
+            self.state = train_state
+            self.rngstate = rng_state
             total_time = end_time - start_time
            avg_time_per_step = total_time / steps_per_epoch
            avg_loss = epoch_loss / steps_per_epoch
            if avg_loss < self.best_loss:
                self.best_loss = avg_loss
-                self.best_state =
+                self.best_state = train_state
                 self.save(current_epoch)
-
-
-
-
-
-
-
+
+            if process_index == 0:
+                if self.wandb is not None:
+                    self.wandb.log({
+                        "train/epoch_time": total_time,
+                        "train/avg_time_per_step": avg_time_per_step,
+                        "train/avg_loss": avg_loss,
+                        "train/best_loss": self.best_loss,
+                        "train/epoch": current_epoch,
+                    }, step=current_step)
+            print(colored(f"\n\tEpoch {current_epoch} completed. Avg Loss: {avg_loss}, Time: {total_time:.2f}s, Best Loss: {self.best_loss}", 'green'))
         self.save(epochs)
-        return self.state
+        return self.state
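Note: in the new `fit` loop every process runs the same steps, but only process 0 owns a tqdm bar, and per-step losses are averaged across hosts with `multihost_utils.process_allgather`. A small sketch of that aggregation and single-host reporting pattern; the loss value is a stand-in:

```python
import jax
import jax.numpy as jnp
from jax.experimental import multihost_utils

def aggregate_loss(local_loss):
    # Gather the per-host values onto every process, then reduce to one scalar.
    gathered = multihost_utils.process_allgather(local_loss)
    return jnp.mean(gathered)

local_loss = jnp.float32(0.42)        # stand-in for the loss returned by train_step
mean_loss = aggregate_loss(local_loss)
if jax.process_index() == 0:          # report from one host, like the tqdm bar in fit()
    print(f"loss={float(mean_loss):.4f}")
```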
{flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flaxdiff
-Version: 0.1.4
+Version: 0.1.5
 Summary: A versatile and easy to understand Diffusion library
 Author: Ashish Kumar Singh
 Author-email: ashishkmr472@gmail.com
@@ -27,7 +27,7 @@ The `Diffusion_flax_linen.ipynb` notebook is my main workspace for experiments.
 
 In the `example notebooks` folder, you will find comprehensive notebooks for various diffusion techniques, written entirely from scratch and are independent of the FlaxDiff library. Each notebook includes detailed explanations of the underlying mathematics and concepts, making them invaluable resources for learning and understanding diffusion models.
 
-### Available Notebooks
+### Available Notebooks and Resources
 
 - **[Diffusion explained (nbviewer link)](https://nbviewer.org/github/AshishKumar4/FlaxDiff/blob/main/tutorial%20notebooks/simple%20diffusion%20flax.ipynb) [(local link)](tutorial%20notebooks/simple%20diffusion%20flax.ipynb)**
 
@@ -46,6 +46,14 @@ In the `example notebooks` folder, you will find comprehensive notebooks for var
 
 These notebooks aim to provide a very easy to understand and step-by-step guide to the various diffusion models and techniques. They are designed to be beginner-friendly, and thus although they may not adhere to the exact formulations and implementations of the original papers to make them more understandable and generalizable, I have tried my best to keep them as accurate as possible. If you find any mistakes or have any suggestions, please feel free to open an issue or a pull request.
 
+#### Other resources
+
+- **[Multi-host Data parallel training script in JAX](./training.py)**
+  - Training script for multi-host data parallel training in JAX, to serve as a reference for training large models on multiple GPUs/TPUs across multiple hosts. A full-fledged tutorial notebook is in the works.
+
+- **[TPU utilities for making life easier](./tpu-tools/)**
+  - A collection of utilities and scripts to make working with TPUs easier, such as cli to create/start/stop/setup TPUs, script to setup TPU VMs (install everything you need), mounting gcs datasets etc.
+
 ## Disclaimer (and About Me)
 
 I worked as a Machine Learning Researcher at Hyperverge from 2019-2021, focusing on computer vision, specifically facial anti-spoofing and facial detection & recognition. Since switching to my current job in 2021, I haven't engaged in as much R&D work, leading me to start this pet project to revisit and relearn the fundamentals and get familiar with the state-of-the-art. My current role involves primarily Golang system engineering with some applied ML work just sprinkled in. Therefore, the code may reflect my learning journey. Please forgive any mistakes and do open an issue to let me know.
{flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/RECORD
CHANGED
@@ -1,11 +1,14 @@
 flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=B0GcHlzlVYDNEIdh2v5qmP4u0neIT-FqexNohuyuCvg,2452
 flaxdiff/models/__init__.py,sha256=FAivVYXxM2JrCFIXf-C3374RB2Hth25dBrzOeNFhH1U,26
-flaxdiff/models/attention.py,sha256=
-flaxdiff/models/common.py,sha256=
+flaxdiff/models/attention.py,sha256=KiAUyfujGpUZR13aJR6RVnL6pBXk5UcyM62VIXhojMg,14468
+flaxdiff/models/common.py,sha256=jlyRB4uF7BmeuExor1YHaqEbBjSuyaDZ4mDsSW3rWKE,7948
 flaxdiff/models/favor_fastattn.py,sha256=79Ew1nqarsNLPzZaBSd1ILORzJr74CupYeqGiCQK5E4,27689
-flaxdiff/models/simple_unet.py,sha256=
+flaxdiff/models/simple_unet.py,sha256=o1DCa9yvqarEGTiUKsTqE70q-h6bRU6HcU0lZpb65jc,11418
 flaxdiff/models/simple_vit.py,sha256=vTu2CQRoSOxetBHTrnCWddm-vxrZDkMe8EpdNxtpJMk,4015
+flaxdiff/models/autoencoder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+flaxdiff/models/autoencoder/autoencoder.py,sha256=At-DhcmrZ0Gao4PUa4l9D25FTdTPwbE4gu6LKcFKzUQ,433
+flaxdiff/models/autoencoder/diffusers.py,sha256=gwyD98277vQGKVPFbyd6w6CupoxMsNgKlN67AtzLCtg,3267
 flaxdiff/predictors/__init__.py,sha256=SKkYYRF9Wfgk2zhtZw4vCXOdOeRlrm2Mk6cvuaEvAzc,4403
 flaxdiff/samplers/__init__.py,sha256=_S-9TwDeshrI0VmapV-J2hqjTByOa0-oOeUs_IdovjU,285
 flaxdiff/samplers/common.py,sha256=_an5h5Niz9Joz_-ppridLrGHpu8X0VVvhNGknPu6AUY,5272
@@ -24,9 +27,9 @@ flaxdiff/schedulers/exp.py,sha256=cPTnUJpYdzJRRZqMLYQz0rRUCpEmaP2tXhRumLx94jA,60
 flaxdiff/schedulers/karras.py,sha256=4GN120kGwdxxU-h2mVdhBVy9IORkUMm_vvz3XjthBcI,3355
 flaxdiff/schedulers/linear.py,sha256=6003F5ISq1Wc0h6UAzY95MJgsDIKGMhBzbiVALpea0k,581
 flaxdiff/schedulers/sqrt.py,sha256=1F84ZgQPuoNMhe6yxGTR2G0h7dPOZtm4UDQOakbSsEU,445
-flaxdiff/trainer/__init__.py,sha256=
-flaxdiff/trainer/simple_trainer.py,sha256=
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
+flaxdiff/trainer/__init__.py,sha256=17qKQFITCfaXQFKYElMzkE-c-EPrv5iUL66gY1gKOsQ,7243
+flaxdiff/trainer/simple_trainer.py,sha256=f4g2KGuGM__d9v_4Ip3ng8wQubmenWZUW60VEu2ANOg,16774
+flaxdiff-0.1.5.dist-info/METADATA,sha256=tGKayFhkYSJJnLY_sHiaCJ60kJZqnO-kcLM3uH3JSN4,19811
+flaxdiff-0.1.5.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+flaxdiff-0.1.5.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.1.5.dist-info/RECORD,,
{flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/WHEEL
File without changes
{flaxdiff-0.1.4.dist-info → flaxdiff-0.1.5.dist-info}/top_level.txt
File without changes