flaxdiff 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flaxdiff/data/online_loader.py +92 -52
- {flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/METADATA +1 -1
- {flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/RECORD +5 -5
- {flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/WHEEL +0 -0
- {flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/top_level.txt +0 -0
flaxdiff/data/online_loader.py
CHANGED
@@ -13,7 +13,7 @@ from typing import Any, Dict, List, Tuple
 import numpy as np
 from functools import partial
 
-from datasets import load_dataset, concatenate_datasets, Dataset
+from datasets import load_dataset, concatenate_datasets, Dataset, load_from_disk
 from datasets.utils.file_utils import get_datasets_user_agent
 from concurrent.futures import ThreadPoolExecutor
 import io
@@ -25,7 +25,8 @@ import cv2
 USER_AGENT = get_datasets_user_agent()
 
 data_queue = Queue(16*2000)
-error_queue = Queue(
+error_queue = Queue()
+
 
 def fetch_single_image(image_url, timeout=None, retries=0):
     for _ in range(retries + 1):
@@ -42,19 +43,35 @@ def fetch_single_image(image_url, timeout=None, retries=0):
             image = None
     return image
 
+
+def default_image_processor(image, image_shape, interpolation=cv2.INTER_LANCZOS4):
+    image = A.longest_max_size(image, max(
+        image_shape), interpolation=interpolation)
+    image = A.pad(
+        image,
+        min_height=image_shape[0],
+        min_width=image_shape[1],
+        border_mode=cv2.BORDER_CONSTANT,
+        value=[255, 255, 255],
+    )
+    return image
+
+
 def map_sample(
-    url, caption,
+    url, caption,
     image_shape=(256, 256),
     timeout=15,
     retries=3,
     upscale_interpolation=cv2.INTER_LANCZOS4,
     downscale_interpolation=cv2.INTER_AREA,
+    image_processor=default_image_processor,
 ):
     try:
-
+        # Assuming fetch_single_image is defined elsewhere
+        image = fetch_single_image(url, timeout=timeout, retries=retries)
         if image is None:
             return
-
+
         image = np.array(image)
         original_height, original_width = image.shape[:2]
         # check if the image is too small
@@ -69,14 +86,10 @@ def map_sample(
         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         downscale = max(original_width, original_height) > max(image_shape)
         interpolation = downscale_interpolation if downscale else upscale_interpolation
-
-        image =
-            image,
-
-            min_width=image_shape[1],
-            border_mode=cv2.BORDER_CONSTANT,
-            value=[255, 255, 255],
-        )
+
+        image = image_processor(
+            image, image_shape, interpolation=interpolation)
+
         data_queue.put({
             "url": url,
             "caption": caption,
@@ -85,65 +98,77 @@ def map_sample(
             "original_width": original_width,
         })
     except Exception as e:
-
-        error_queue.put({
+        error_queue.put_nowait({
            "url": url,
            "caption": caption,
            "error": str(e)
        })
 
-
+
+def map_batch(batch, num_threads=256, image_shape=(256, 256), timeout=15, retries=3, image_processor=default_image_processor):
     try:
-        map_sample_fn = partial(map_sample, image_shape=image_shape,
+        map_sample_fn = partial(map_sample, image_shape=image_shape,
+                                timeout=timeout, retries=retries, image_processor=image_processor)
         with ThreadPoolExecutor(max_workers=num_threads) as executor:
             executor.map(map_sample_fn, batch["url"], batch['caption'])
     except Exception as e:
-        print(f"Error in map_batch: {str(e)}")
         error_queue.put({
             "batch": batch,
             "error": str(e)
         })
-
-
-
+
+
+def parallel_image_loader(dataset: Dataset, num_workers: int = 8, image_shape=(256, 256),
+                          num_threads=256, timeout=15, retries=3, image_processor=default_image_processor):
+    map_batch_fn = partial(map_batch, num_threads=num_threads, image_shape=image_shape,
+                           timeout=timeout, retries=retries, image_processor=image_processor)
     shard_len = len(dataset) // num_workers
     print(f"Local Shard lengths: {shard_len}")
     with multiprocessing.Pool(num_workers) as pool:
         iteration = 0
         while True:
             # Repeat forever
-
-
-            shards = [dataset[i*shard_len:(i+1)*shard_len] for i in range(num_workers)]
+            shards = [dataset[i*shard_len:(i+1)*shard_len]
+                      for i in range(num_workers)]
             print(f"mapping {len(shards)} shards")
             pool.map(map_batch_fn, shards)
             iteration += 1
-
+            print(f"Shuffling dataset with seed {iteration}")
+            dataset = dataset.shuffle(seed=iteration)
+            # Clear the error queue
+            while not error_queue.empty():
+                error_queue.get_nowait()
+
+
 class ImageBatchIterator:
-    def __init__(self, dataset: Dataset, batch_size: int = 64, image_shape=(256, 256),
+    def __init__(self, dataset: Dataset, batch_size: int = 64, image_shape=(256, 256),
+                 num_workers: int = 8, num_threads=256, timeout=15, retries=3, image_processor=default_image_processor):
         self.dataset = dataset
         self.num_workers = num_workers
         self.batch_size = batch_size
-        loader = partial(parallel_image_loader, num_threads=num_threads,
+        loader = partial(parallel_image_loader, num_threads=num_threads,
+                         image_shape=image_shape, num_workers=num_workers,
+                         timeout=timeout, retries=retries, image_processor=image_processor)
         self.thread = threading.Thread(target=loader, args=(dataset,))
         self.thread.start()
-
+
     def __iter__(self):
         return self
-
+
     def __next__(self):
         def fetcher(_):
             return data_queue.get()
         with ThreadPoolExecutor(max_workers=self.batch_size) as executor:
             batch = list(executor.map(fetcher, range(self.batch_size)))
         return batch
-
+
     def __del__(self):
         self.thread.join()
-
+
     def __len__(self):
         return len(self.dataset) // self.batch_size
 
+
 def default_collate(batch):
     urls = [sample["url"] for sample in batch]
     captions = [sample["caption"] for sample in batch]
@@ -153,7 +178,8 @@ def default_collate(batch):
         "caption": captions,
         "image": images,
     }
-
+
+
 def dataMapper(map: Dict[str, Any]):
     def _map(sample) -> Dict[str, Any]:
         return {
@@ -162,16 +188,17 @@ def dataMapper(map: Dict[str, Any]):
         }
     return _map
 
+
 class OnlineStreamingDataLoader():
     def __init__(
-        self,
-        dataset,
-        batch_size=64,
+        self,
+        dataset,
+        batch_size=64,
         image_shape=(256, 256),
-        num_workers=16,
+        num_workers=16,
         num_threads=512,
         default_split="all",
-        pre_map_maker=dataMapper,
+        pre_map_maker=dataMapper,
         pre_map_def={
             "url": "URL",
             "caption": "TEXT",
@@ -180,40 +207,53 @@ class OnlineStreamingDataLoader():
         global_process_index=0,
         prefetch=1000,
         collate_fn=default_collate,
+        timeout=15,
+        retries=3,
+        image_processor=default_image_processor,
     ):
         if isinstance(dataset, str):
             dataset_path = dataset
             print("Loading dataset from path")
-
+            if "gs://" in dataset:
+                dataset = load_from_disk(dataset_path)
+            else:
+                dataset = load_dataset(dataset_path, split=default_split)
         elif isinstance(dataset, list):
             if isinstance(dataset[0], str):
                 print("Loading multiple datasets from paths")
-                dataset = [
+                dataset = [load_from_disk(dataset_path) if "gs://" in dataset_path else load_dataset(
+                    dataset_path, split=default_split) for dataset_path in dataset]
             print("Concatenating multiple datasets")
             dataset = concatenate_datasets(dataset)
-
-
+            dataset = dataset.shuffle(seed=0)
+        # dataset = dataset.map(pre_map_maker(pre_map_def), batched=True, batch_size=10000000)
+        self.dataset = dataset.shard(
+            num_shards=global_process_count, index=global_process_index)
         print(f"Dataset length: {len(dataset)}")
-        self.iterator = ImageBatchIterator(self.dataset, image_shape=image_shape,
-
+        self.iterator = ImageBatchIterator(self.dataset, image_shape=image_shape,
+                                           num_workers=num_workers, batch_size=batch_size, num_threads=num_threads,
+                                           timeout=timeout, retries=retries, image_processor=image_processor)
         self.batch_size = batch_size
-
+
         # Launch a thread to load batches in the background
         self.batch_queue = queue.Queue(prefetch)
-
+
         def batch_loader():
             for batch in self.iterator:
-
-
+                try:
+                    self.batch_queue.put(collate_fn(batch))
+                except Exception as e:
+                    print("Error processing batch", e)
+
         self.loader_thread = threading.Thread(target=batch_loader)
         self.loader_thread.start()
-
+
     def __iter__(self):
         return self
-
+
     def __next__(self):
-        return self.
+        return self.batch_queue.get()
         # return self.collate_fn(next(self.iterator))
-
+
     def __len__(self):
-        return len(self.dataset)
+        return len(self.dataset)
{flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 flaxdiff/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 flaxdiff/utils.py,sha256=B0GcHlzlVYDNEIdh2v5qmP4u0neIT-FqexNohuyuCvg,2452
 flaxdiff/data/__init__.py,sha256=PM3PkHihyohT5SHVYKc8vQ4IeVfGPpCktkSVwvqMjQ4,52
-flaxdiff/data/online_loader.py,sha256=
+flaxdiff/data/online_loader.py,sha256=qim6SRRGU1lRO0zQbDNjRYC7Qm6g7jtUfELEXotora0,8987
 flaxdiff/models/__init__.py,sha256=FAivVYXxM2JrCFIXf-C3374RB2Hth25dBrzOeNFhH1U,26
 flaxdiff/models/attention.py,sha256=ZbDGIb5Q6FRqJ6qRY660cqw4WvF9IwCnhEuYdTpLPdM,13023
 flaxdiff/models/common.py,sha256=fd-Fl0VCNEBjijHNwGBqYL5VvXe9u0347h25czNTmRw,10780
@@ -34,7 +34,7 @@ flaxdiff/trainer/__init__.py,sha256=T-vUVq4zHcMK6kpCsG4Gu8vn71q6lZD-lg-Ul7yKfEk,
 flaxdiff/trainer/autoencoder_trainer.py,sha256=al7AsZ7yeDMEiDD-gbcXf0ADq_xfk1VMxvg24GfA-XQ,7008
 flaxdiff/trainer/diffusion_trainer.py,sha256=wKkg63DWZjx2MoM3VQNCDIr40rWN8fUGxH9jWWxfZao,9373
 flaxdiff/trainer/simple_trainer.py,sha256=Z77zRS5viJpd2Mpl6sonJk5WcnEWi2Cd4gl4u5tIX2M,18206
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
-flaxdiff-0.1.
+flaxdiff-0.1.18.dist-info/METADATA,sha256=aUSr3lBb9P2mnrpmbcgQa41DT8YYM-DtVMU8NI3CZEE,22083
+flaxdiff-0.1.18.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+flaxdiff-0.1.18.dist-info/top_level.txt,sha256=-2-nXnfkJgSfkki1tjm5Faw6Dso7vhtdn2szwCdX5CQ,9
+flaxdiff-0.1.18.dist-info/RECORD,,
{flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/WHEEL
File without changes
{flaxdiff-0.1.16.dist-info → flaxdiff-0.1.18.dist-info}/top_level.txt
File without changes
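The other notable change in online_loader.py is that string dataset paths containing "gs://" are now read with load_from_disk instead of load_dataset. A small sketch of preparing such a dataset, assuming gcsfs is installed and using a hypothetical bucket and dataset name:

from datasets import load_dataset, load_from_disk

# Materialize a (hypothetical) source dataset to a GCS bucket once...
ds = load_dataset("user/some-image-text-dataset", split="train")
ds.save_to_disk("gs://example-bucket/some-image-text-dataset")

# ...which is roughly what OnlineStreamingDataLoader("gs://example-bucket/some-image-text-dataset", ...)
# now does internally when it sees a "gs://" path:
ds = load_from_disk("gs://example-bucket/some-image-text-dataset")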