PyPI - datago - Versions diffs - 2025.10.2__tar.gz → 2025.12.2__tar.gz - Mend

datago 2025.10.2tar.gz → 2025.12.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

{datago-2025.10.2 → datago-2025.12.2}/Cargo.lock RENAMED Viewed

@@ -111,6 +111,16 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+[[package]]
+name = "assert-json-diff"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 [[package]]
 name = "async-channel"
 version = "1.9.0"
@@ -464,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02"
 dependencies = [
  "smallvec",
- "target-lexicon 0.12.16",
+ "target-lexicon",
 ]
 [[package]]
@@ -613,7 +623,7 @@ dependencies = [
 [[package]]
 name = "datago"
-version = "2025.10.2"
+version = "2025.12.2"
 dependencies = [
  "async-compression",
  "async-tar",
@@ -644,8 +654,27 @@ dependencies = [
  "tokio-util",
  "url",
  "walkdir",
+ "wiremock",
+]
+[[package]]
+name = "deadpool"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490"
+dependencies = [
+ "async-trait",
+ "deadpool-runtime",
+ "num_cpus",
+ "tokio",
 ]
+[[package]]
+name = "deadpool-runtime"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b"
 [[package]]
 name = "dirs-next"
 version = "2.0.0"
@@ -1118,6 +1147,12 @@ version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
 [[package]]
 name = "hyper"
 version = "1.6.0"
@@ -1131,6 +1166,7 @@ dependencies = [
  "http",
  "http-body",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -2026,9 +2062,9 @@ dependencies = [
 [[package]]
 name = "pyo3"
-version = "0.24.1"
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229"
+checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
 dependencies = [
  "cfg-if",
  "indoc",
@@ -2044,19 +2080,19 @@ dependencies = [
 [[package]]
 name = "pyo3-build-config"
-version = "0.24.1"
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1"
+checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
 dependencies = [
  "once_cell",
- "target-lexicon 0.13.2",
+ "target-lexicon",
 ]
 [[package]]
 name = "pyo3-ffi"
-version = "0.24.1"
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc"
+checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
 dependencies = [
  "libc",
  "pyo3-build-config",
@@ -2064,9 +2100,9 @@ dependencies = [
 [[package]]
 name = "pyo3-macros"
-version = "0.24.1"
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44"
+checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
 dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -2076,9 +2112,9 @@ dependencies = [
 [[package]]
 name = "pyo3-macros-backend"
-version = "0.24.1"
+version = "0.22.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855"
+checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
 dependencies = [
  "heck",
  "proc-macro2",
@@ -2753,12 +2789,6 @@ version = "0.12.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
-[[package]]
-name = "target-lexicon"
-version = "0.13.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
 [[package]]
 name = "tempfile"
 version = "3.20.0"
@@ -3454,6 +3484,30 @@ dependencies = [
  "memchr",
 ]
+[[package]]
+name = "wiremock"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b8b99d4cdbf36b239a9532e31fe4fb8acc38d1897c1761e161550a7dc78e6a"
+dependencies = [
+ "assert-json-diff",
+ "async-trait",
+ "base64",
+ "deadpool",
+ "futures",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "log",
+ "once_cell",
+ "regex",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+]
 [[package]]
 name = "wit-bindgen-rt"
 version = "0.33.0"

{datago-2025.10.2 → datago-2025.12.2}/Cargo.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [package]
 name = "datago"
 edition = "2021"
-version = "2025.10.2"
+version = "2025.12.2"
 readme = "README.md"
 [lib]
@@ -24,7 +24,7 @@ kanal = "0.1"
 clap = { version = "4.5.27", features = ["derive"] }
 tokio = { version = "1.43.1", features = ["rt-multi-thread", "macros"] }
 prettytable-rs = "0.10.0"
-pyo3 = { version = "0.24.1", features = ["extension-module"] }
+pyo3 = { version = "0.22", features = ["extension-module"] }
 threadpool = "1.8.1"
 openssl = { version = "0.10", features = ["vendored"] }
 walkdir = "2.5.0"
@@ -46,6 +46,7 @@ fast_image_resize = { version ="5.1.3", features=["image"]}
 [dev-dependencies]
 tempfile = "3.13.0"
+wiremock = "0.6.0"
 [profile.release]
 opt-level = 3

{datago-2025.10.2 → datago-2025.12.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datago
-Version: 2025.10.2
+Version: 2025.12.2
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -8,7 +8,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 License-File: LICENSE
 Summary: A high performance dataloader for Python, written in Rust
-Author: Benjamin Lefaudeux
+Author: Benjamin Lefaudeux, Roman Frigg
 Author-email: Photoroom <team@photoroom.com>
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
@@ -267,7 +267,7 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 <details> <summary><strong>Benchmarks</strong></summary>
 As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
-In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisury pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 3000 images per second.
 ### AMD Zen3 laptop - IN1k - disk
 ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
@@ -275,7 +275,7 @@ In general, Datago will be impactful if you want to load a lot of images very fa
 ### AMD EPYC 9454 - IN1k - disk
 ![AMD EPYC 9454](assets/epyc_vast.png)
-This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly !), so given enough time the two results would look closer.
+This benchmark is using the PD12M dataset, which hosts high-resolution images. It's accessed through the webdataset front end, and datago is compared with the popular Python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
 ### AMD EPYC 9454 - pd12m - webdataset
 ![AMD EPYC 9454](assets/epyc_wds.png)

{datago-2025.10.2 → datago-2025.12.2}/README.md RENAMED Viewed

@@ -250,7 +250,7 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 <details> <summary><strong>Benchmarks</strong></summary>
 As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
-In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisury pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is with the images and the higher quality they are, the more Datago will shine. The following benchmarks are using ImageNet 1k, which is very low resolution and thus kind of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 3000 images per second.
 ### AMD Zen3 laptop - IN1k - disk
 ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
@@ -258,7 +258,7 @@ In general, Datago will be impactful if you want to load a lot of images very fa
 ### AMD EPYC 9454 - IN1k - disk
 ![AMD EPYC 9454](assets/epyc_vast.png)
-This benchmark is using the PD12M dataset, which is a 12M images dataset, with a lot of high resolution images. It's accessed through the webdataset front end, datago is compared with the popular python webdataset library. Note that datago will start streaming the images faster here (almost instantly !), so given enough time the two results would look closer.
+This benchmark is using the PD12M dataset, which hosts high-resolution images. It's accessed through the webdataset front end, and datago is compared with the popular Python webdataset library. Note that datago will start streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
 ### AMD EPYC 9454 - pd12m - webdataset
 ![AMD EPYC 9454](assets/epyc_wds.png)

datago-2025.12.2/assets/epyc_vast.png ADDED Viewed

Binary file

datago-2025.12.2/assets/zen3_ssd.png ADDED Viewed

Binary file

{datago-2025.10.2 → datago-2025.12.2}/pyproject.toml RENAMED Viewed

@@ -2,7 +2,7 @@
 name = "datago"
 dynamic = ["version"]
 authors = [
-  { name = "Benjamin Lefaudeux" },
+  { name = "Benjamin Lefaudeux, Roman Frigg" },
   { name = "Photoroom", email = "team@photoroom.com" }
 ]
 description = "A high performance dataloader for Python, written in Rust"

{datago-2025.10.2 → datago-2025.12.2}/python/benchmark_db.py RENAMED Viewed

@@ -6,7 +6,7 @@ import typer
 from benchmark_defaults import IMAGE_CONFIG
 from datago import DatagoClient  # type: ignore
 from PIL import Image
-from raw_types import raw_array_to_numpy, raw_array_to_pil_image
+from raw_types import raw_array_to_numpy
 from tqdm import tqdm
@@ -58,21 +58,7 @@ def benchmark(
     for _ in tqdm(range(limit), dynamic_ncols=True):
         sample = client.get_sample()
         if sample.id:
-            # Bring the masks and image to PIL
-            if hasattr(sample, "image"):
-                img = raw_array_to_pil_image(sample.image)
-            if hasattr(sample, "masks"):
-                for _, mask_buffer in sample.masks.items():
-                    mask = raw_array_to_pil_image(mask_buffer)
-            if (
-                hasattr(sample, "additional_images")
-                and "masked_image" in sample.additional_images
-            ):
-                masked_image = raw_array_to_pil_image(
-                    sample.AdditionalImages["masked_image"]
-                )
+            # Images are already PIL by default
             # Bring the latents to numpy
             if hasattr(sample, "latents"):

{datago-2025.10.2 → datago-2025.12.2}/python/benchmark_filesystem.py RENAMED Viewed

@@ -9,26 +9,34 @@ from tqdm import tqdm
 def benchmark(
-    root_path: str = typer.Option(os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"),
+    root_path: str = typer.Option(
+        os.getenv("DATAGO_TEST_FILESYSTEM", ""), help="The source to test out"
+    ),
     limit: int = typer.Option(2000, help="The number of samples to test on"),
-    crop_and_resize: bool = typer.Option(False, help="Crop and resize the images on the fly"),
+    crop_and_resize: bool = typer.Option(
+        False, help="Crop and resize the images on the fly"
+    ),
     compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
     num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
     sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
 ):
     if sweep:
-        results = {}
-        for num_workers in range(2, (os.cpu_count() or 2), 16):
-            results[num_workers] = benchmark(root_path, limit, crop_and_resize, compare_torch, num_workers, False)
+        results_sweep = {}
+        for num_workers in range(2, (os.cpu_count() * 2 or 2), 2):
+            results_sweep[num_workers] = benchmark(
+                root_path, limit, crop_and_resize, compare_torch, num_workers, False
+            )
         # Save results to a json file
         with open("benchmark_results_filesystem.json", "w") as f:
-            json.dump(results, f, indent=2)
+            json.dump(results_sweep, f, indent=2)
-        return results
+        return results_sweep
-    print(f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers")
+    print(
+        f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers"
+    )
     # This setting is not exposed in the config, but an env variable can be used instead
     os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
@@ -59,6 +67,11 @@ def benchmark(
     for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
         assert sample["id"] != ""
         img = sample["image"]
+        if count < limit - 1:
+            del img
+            img = None  # Help with memory pressure
         count += 1
     assert count == limit, f"Expected {limit} samples, got {count}"
@@ -80,7 +93,9 @@ def benchmark(
         transform = (
             transforms.Compose(
                 [
-                    transforms.Resize((1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS),
+                    transforms.Resize(
+                        (1024, 1024), interpolation=transforms.InterpolationMode.LANCZOS
+                    ),
                 ]
             )
             if crop_and_resize
@@ -88,7 +103,9 @@ def benchmark(
         )
         # Create the ImageFolder dataset
-        dataset = datasets.ImageFolder(root=root_path, transform=transform, allow_empty=True)
+        dataset = datasets.ImageFolder(
+            root=root_path, transform=transform, allow_empty=True
+        )
         # Create a DataLoader to allow for multiple workers
         # Use available CPU count for num_workers
@@ -107,6 +124,8 @@ def benchmark(
             n_images += len(batch)
             if n_images > limit:
                 break
+            del batch  # Help with memory pressure, same as above
         fps = n_images / (time.time() - start)
         results["torch"] = {"fps": fps, "count": n_images}
         print(f"Torch - FPS {fps:.2f} - workers {num_workers}")

{datago-2025.10.2 → datago-2025.12.2}/python/dataset.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from datago import DatagoClient, initialize_logging
 import json
 from typing import Dict, Any
-from raw_types import raw_array_to_pil_image, raw_array_to_numpy
+from raw_types import raw_array_to_numpy
 class DatagoIterDataset:
@@ -29,11 +29,13 @@ class DatagoIterDataset:
             return json.loads(item)
         if isinstance(item, dict):
-            # recurvisely convert the dictionary
+            # recursively convert the dictionary
             return {k: DatagoIterDataset.to_python_types(v, k) for k, v in item.items()}
         elif "image" in key:
-            return raw_array_to_pil_image(item)
+            # The Rust-side returns PythonImagePayload objects that are callable
+            # Call them to get the actual PIL image
+            return item()
         elif "latent" in key:
             return raw_array_to_numpy(item)

datago-2025.12.2/python/raw_types.py ADDED Viewed

@@ -0,0 +1,40 @@
+from PIL import Image
+from typing import Optional, Union
+import numpy as np
+from datago import ImagePayload
+def raw_array_to_numpy(raw_array: ImagePayload) -> Optional[np.ndarray]:
+    """
+    Deserialize a numpy-serialized payload (`.npy` bytes) into an array.
+    Returns None when the payload is empty, or when the bytes cannot be
+    parsed as a numpy array (a message is printed in that case); the
+    caller is expected to handle the None.
+    """
+    import io
+    data = raw_array.data
+    if len(data) == 0:
+        return None
+    # Generic numpy-serialized array.
+    # np.load treats anything that is not file-like as a *path* to open, so
+    # the in-memory buffer must be wrapped in a file-like object first, the
+    # same way decode_image_payload wraps payload.data for PIL.
+    try:
+        return np.load(io.BytesIO(data), allow_pickle=False)
+    except ValueError:
+        # Do not try to handle these, return None and we'll handle it in the caller
+        print("Could not deserialize numpy array")
+        return None
+def decode_image_payload(payload: ImagePayload) -> Image.Image:
+    """
+    Open the encoded bytes carried by an ImagePayload as a PIL Image.
+    Intended entry point for API users who receive encoded images.
+    """
+    from io import BytesIO
+    # PIL needs a file-like object, so expose the payload bytes as a stream
+    encoded_stream = BytesIO(payload.data)
+    return Image.open(encoded_stream)
+def get_image_mode(image_or_payload: Union[ImagePayload, Image.Image]) -> str:
+    """
+    Return the PIL mode string for either a PIL Image or an ImagePayload.
+    Objects without a `mode` attribute are treated as encoded ImagePayloads
+    and are decoded before the mode is read.
+    """
+    if not hasattr(image_or_payload, "mode"):
+        # No `mode` attribute: an ImagePayload (encoded image), decode it first
+        decoded = decode_image_payload(image_or_payload)
+        return decoded.mode
+    # Already exposes `mode`: it's a PIL Image
+    return image_or_payload.mode

{datago-2025.10.2 → datago-2025.12.2}/python/test_datago_client.py RENAMED Viewed

@@ -115,7 +115,13 @@ class TestDatagoClient:
                 assert sample.source == "filesystem"
                 assert sample.image.width > 0
                 assert sample.image.height > 0
-                assert len(sample.image.data) > 0
+                # Check the payload path
+                payload = sample.image.get_payload()
+                assert len(payload.data) > 0
+                assert payload.width == sample.image.width
+                assert payload.height == sample.image.height
+                assert payload.channels == 3
     def test_client_with_image_transformations(self):
         """Test client with image transformation configuration."""
@@ -149,7 +155,7 @@ class TestDatagoClient:
             assert sample is not None
             assert sample.image.width <= 64
             assert sample.image.height <= 64
-            assert sample.image.channels == 3  # RGB8
+            assert sample.image.mode == "RGB"
     def test_client_with_image_encoding(self):
         """Test client with image encoding enabled."""
@@ -177,8 +183,7 @@ class TestDatagoClient:
             sample = client.get_sample()
             assert sample is not None
-            assert sample.image.channels == -1  # Encoded images have channels = -1
-            assert len(sample.image.data) > 0
+            assert sample.image.mode == "RGB"
     def test_random_sampling(self):
         """Test that random sampling produces different results."""

{datago-2025.10.2 → datago-2025.12.2}/python/test_datago_db.py RENAMED Viewed

@@ -3,7 +3,9 @@ import pytest
 import os
 import json
-from raw_types import raw_array_to_pil_image, raw_array_to_numpy, get_image_mode, decode_image_payload
+from raw_types import (
+    decode_image_payload,
+)
 from dataset import DatagoIterDataset
@@ -62,9 +64,11 @@ def test_caption_and_image():
         assert img.height > 0
         assert img.width > 0
-        assert img.height <= img.original_height
-        assert img.width <= img.original_width
-        assert img.channels == channels
+        payload = img.get_payload()
+        assert img.height <= payload.original_height
+        assert img.width <= payload.original_width
+        assert img.mode == "RGB" if channels == 3 else "L"
+        assert payload.channels == channels
     for i, sample in enumerate(dataset):
         assert sample.source != ""
@@ -80,14 +84,11 @@ def test_caption_and_image():
         check_image(sample.masks["segmentation_mask"], 1)
         # Check the image decoding
-        assert raw_array_to_pil_image(sample.image).mode == "RGB", "Image should be RGB"
-        assert (
-            raw_array_to_pil_image(sample.additional_images["masked_image"]).mode
-            == "RGB"
-        ), "Image should be RGB"
-        assert raw_array_to_pil_image(sample.masks["segmentation_mask"]).mode == "L", (
-            "Mask should be L"
+        assert sample.image.mode == "RGB", "Image should be RGB"
+        assert sample.additional_images["masked_image"].mode == "RGB", (
+            "Image should be RGB"
         )
+        assert sample.masks["segmentation_mask"].mode == "L", "Mask should be L"
         if i > N_SAMPLES:
             break
@@ -146,36 +147,13 @@ def test_jpeg_compression():
     sample = next(iter(dataset))
-    # When images are encoded, channels is set to -1 to signal encoded format
-    assert sample.image.channels == -1, "Image should be encoded (channels == -1)"
-    assert (
-        sample.additional_images["masked_image"].channels == -1
-    ), "Additional image should be encoded"
-    assert (
-        sample.masks["segmentation_mask"].channels == -1
-    ), "Mask should be encoded"
-    # Test that raw_array_to_pil_image returns ImagePayload for encoded images
-    image_result = raw_array_to_pil_image(sample.image)
-    assert not hasattr(image_result, 'mode'), "Should return ImagePayload, not PIL Image"
-    assert hasattr(image_result, 'data'), "Should have data attribute"
-    assert hasattr(image_result, 'channels'), "Should have channels attribute"
-    assert image_result.channels == -1, "Should be encoded ImagePayload"
-    # Test proper decoding using decode_image_payload
-    decoded_image = decode_image_payload(image_result)
-    assert hasattr(decoded_image, 'mode'), "Decoded image should be PIL Image"
-    assert decoded_image.mode == "RGB", "Image should decode to RGB"
-    assert decoded_image.size == (sample.image.width, sample.image.height), "Size should match"
-    # Test additional images and masks
-    additional_result = raw_array_to_pil_image(sample.additional_images["masked_image"])
-    decoded_additional = decode_image_payload(additional_result)
-    assert decoded_additional.mode == "RGB", "Additional image should decode to RGB"
-    mask_result = raw_array_to_pil_image(sample.masks["segmentation_mask"])
-    decoded_mask = decode_image_payload(mask_result)
-    assert decoded_mask.mode == "L", "Mask should decode to L"
+    # Check that the image is properly accessible through PIL, but that it is encoded
+    assert sample.image.mode == "RGB", "Image should be RGB"
+    assert sample.additional_images["masked_image"].mode == "RGB", "Image should be RGB"
+    assert sample.masks["segmentation_mask"].mode == "L", "Mask should be L"
+    # Check that the image is encoded, as JPG PIL
+    # TODO: @blefaudeux
 def test_png_compression():
@@ -185,23 +163,11 @@ def test_png_compression():
     # Don't specify encode_format - should default to PNG
     dataset = DatagoIterDataset(client_config, return_python_types=False)
-    sample = next(iter(dataset))
-    # When images are encoded, channels is set to -1 to signal encoded format
-    assert sample.image.channels == -1, "Image should be encoded (channels == -1)"
-    # Test that raw_array_to_pil_image returns ImagePayload for encoded images
-    image_result = raw_array_to_pil_image(sample.image)
-    assert not hasattr(image_result, 'mode'), "Should return ImagePayload, not PIL Image"
-    assert hasattr(image_result, 'data'), "Should have data attribute"
-    assert hasattr(image_result, 'channels'), "Should have channels attribute"
-    assert image_result.channels == -1, "Should be encoded ImagePayload"
+    _sample = next(iter(dataset))
-    # Test proper decoding using decode_image_payload
-    decoded_image = decode_image_payload(image_result)
-    assert hasattr(decoded_image, 'mode'), "Decoded image should be PIL Image"
-    assert decoded_image.mode == "RGB", "Image should decode to RGB"
-    assert decoded_image.size == (sample.image.width, sample.image.height), "Size should match"
+    # Check that the image is encoded as PNG (via PIL)
+    # TODO: @blefaudeux
+    # same as above
 def test_original_image():
@@ -212,14 +178,9 @@ def test_original_image():
     dataset = DatagoIterDataset(client_config, return_python_types=False)
     sample = next(iter(dataset))
-    assert raw_array_to_pil_image(sample.image).mode == "RGB", "Image should be RGB"
-    assert (
-        raw_array_to_pil_image(sample.additional_images["masked_image"]).mode == "RGB"
-    ), "Image should be RGB"
-    assert raw_array_to_pil_image(sample.masks["segmentation_mask"]).mode == "L", (
-        "Mask should be L"
-    )
+    payload = sample.image.get_payload()
+    assert payload.original_height == payload.height == sample.image.height
+    assert payload.original_width == payload.width == sample.image.width
 def test_duplicate_state():

{datago-2025.10.2 → datago-2025.12.2}/python/test_datago_edge_cases.py RENAMED Viewed

@@ -90,11 +90,13 @@ class TestDatagoEdgeCases:
             sample = client.get_sample()
             assert sample is not None
-            assert sample.image.original_width == 2000
-            assert sample.image.original_height == 2000
+            image_payload = sample.image.get_payload()
+            assert image_payload is not None
+            assert image_payload.original_width == 2000
+            assert image_payload.original_height == 2000
             # Should be resized
-            assert sample.image.width <= 512
-            assert sample.image.height <= 512
+            assert image_payload.width <= 512
+            assert image_payload.height <= 512
     def test_very_small_images(self):
         """Test handling of very small images."""

datago 2025.10.2__tar.gz → 2025.12.2__tar.gz

datago 2025.10.2tar.gz → 2025.12.2tar.gz