datago 2025.8.1__tar.gz → 2025.12.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {datago-2025.8.1 → datago-2025.12.1}/Cargo.lock +73 -19
  2. {datago-2025.8.1 → datago-2025.12.1}/Cargo.toml +3 -2
  3. {datago-2025.8.1 → datago-2025.12.1}/PKG-INFO +62 -12
  4. {datago-2025.8.1 → datago-2025.12.1}/README.md +60 -10
  5. datago-2025.12.1/assets/epyc_vast.png +0 -0
  6. datago-2025.12.1/assets/epyc_wds.png +0 -0
  7. datago-2025.12.1/assets/zen3_ssd.png +0 -0
  8. {datago-2025.8.1 → datago-2025.12.1}/pyproject.toml +1 -1
  9. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_db.py +16 -27
  10. datago-2025.12.1/python/benchmark_defaults.py +8 -0
  11. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_filesystem.py +44 -19
  12. {datago-2025.8.1 → datago-2025.12.1}/python/benchmark_webdataset.py +34 -20
  13. {datago-2025.8.1 → datago-2025.12.1}/python/dataset.py +8 -3
  14. datago-2025.12.1/python/raw_types.py +40 -0
  15. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_client.py +9 -4
  16. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_db.py +39 -32
  17. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_edge_cases.py +6 -4
  18. {datago-2025.8.1 → datago-2025.12.1}/python/test_datago_filesystem.py +7 -5
  19. datago-2025.12.1/python/test_pil_implicit_conversion.py +80 -0
  20. {datago-2025.8.1 → datago-2025.12.1}/src/client.rs +43 -23
  21. {datago-2025.8.1 → datago-2025.12.1}/src/generator_files.rs +7 -4
  22. {datago-2025.8.1 → datago-2025.12.1}/src/generator_http.rs +9 -6
  23. {datago-2025.8.1 → datago-2025.12.1}/src/generator_wds.rs +38 -21
  24. {datago-2025.8.1 → datago-2025.12.1}/src/image_processing.rs +111 -27
  25. {datago-2025.8.1 → datago-2025.12.1}/src/lib.rs +3 -1
  26. {datago-2025.8.1 → datago-2025.12.1}/src/main.rs +6 -2
  27. {datago-2025.8.1 → datago-2025.12.1}/src/structs.rs +182 -10
  28. {datago-2025.8.1 → datago-2025.12.1}/src/worker_files.rs +97 -53
  29. {datago-2025.8.1 → datago-2025.12.1}/src/worker_http.rs +149 -36
  30. {datago-2025.8.1 → datago-2025.12.1}/src/worker_wds.rs +30 -25
  31. datago-2025.8.1/python/raw_types.py +0 -65
  32. {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/ci-cd.yml +0 -0
  33. {datago-2025.8.1 → datago-2025.12.1}/.github/workflows/rust.yml +0 -0
  34. {datago-2025.8.1 → datago-2025.12.1}/.gitignore +0 -0
  35. {datago-2025.8.1 → datago-2025.12.1}/.pre-commit-config.yaml +0 -0
  36. {datago-2025.8.1 → datago-2025.12.1}/LICENSE +0 -0
  37. {datago-2025.8.1 → datago-2025.12.1}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
  38. {datago-2025.8.1 → datago-2025.12.1}/requirements-tests.txt +0 -0
  39. {datago-2025.8.1 → datago-2025.12.1}/requirements.txt +0 -0
@@ -111,6 +111,16 @@ version = "0.7.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
+ [[package]]
+ name = "assert-json-diff"
+ version = "2.0.2"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12"
+ dependencies = [
+ "serde",
+ "serde_json",
+ ]
+
  [[package]]
  name = "async-channel"
  version = "1.9.0"
@@ -464,7 +474,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02"
  dependencies = [
  "smallvec",
- "target-lexicon 0.12.16",
+ "target-lexicon",
  ]
 
  [[package]]
@@ -613,7 +623,7 @@ dependencies = [
 
  [[package]]
  name = "datago"
- version = "2025.8.1"
+ version = "2025.12.1"
  dependencies = [
  "async-compression",
  "async-tar",
@@ -644,8 +654,27 @@ dependencies = [
  "tokio-util",
  "url",
  "walkdir",
+ "wiremock",
+ ]
+
+ [[package]]
+ name = "deadpool"
+ version = "0.10.0"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "fb84100978c1c7b37f09ed3ce3e5f843af02c2a2c431bae5b19230dad2c1b490"
+ dependencies = [
+ "async-trait",
+ "deadpool-runtime",
+ "num_cpus",
+ "tokio",
  ]
 
+ [[package]]
+ name = "deadpool-runtime"
+ version = "0.1.4"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b"
+
  [[package]]
  name = "dirs-next"
  version = "2.0.0"
@@ -1118,6 +1147,12 @@ version = "1.10.1"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
 
+ [[package]]
+ name = "httpdate"
+ version = "1.0.3"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
  [[package]]
  name = "hyper"
  version = "1.6.0"
@@ -1131,6 +1166,7 @@ dependencies = [
  "http",
  "http-body",
  "httparse",
+ "httpdate",
  "itoa",
  "pin-project-lite",
  "smallvec",
@@ -2026,9 +2062,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "17da310086b068fbdcefbba30aeb3721d5bb9af8db4987d6735b2183ca567229"
+ checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
  dependencies = [
  "cfg-if",
  "indoc",
@@ -2044,19 +2080,19 @@ dependencies = [
 
  [[package]]
  name = "pyo3-build-config"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "e27165889bd793000a098bb966adc4300c312497ea25cf7a690a9f0ac5aa5fc1"
+ checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
  dependencies = [
  "once_cell",
- "target-lexicon 0.13.2",
+ "target-lexicon",
  ]
 
  [[package]]
  name = "pyo3-ffi"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "05280526e1dbf6b420062f3ef228b78c0c54ba94e157f5cb724a609d0f2faabc"
+ checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
  dependencies = [
  "libc",
  "pyo3-build-config",
@@ -2064,9 +2100,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3-macros"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "5c3ce5686aa4d3f63359a5100c62a127c9f15e8398e5fdeb5deef1fed5cd5f44"
+ checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
  dependencies = [
  "proc-macro2",
  "pyo3-macros-backend",
@@ -2076,9 +2112,9 @@ dependencies = [
 
  [[package]]
  name = "pyo3-macros-backend"
- version = "0.24.1"
+ version = "0.22.6"
  source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "f4cf6faa0cbfb0ed08e89beb8103ae9724eb4750e3a78084ba4017cbe94f3855"
+ checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
  dependencies = [
  "heck",
  "proc-macro2",
@@ -2753,12 +2789,6 @@ version = "0.12.16"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
 
- [[package]]
- name = "target-lexicon"
- version = "0.13.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a"
-
  [[package]]
  name = "tempfile"
  version = "3.20.0"
@@ -3454,6 +3484,30 @@ dependencies = [
  "memchr",
  ]
 
+ [[package]]
+ name = "wiremock"
+ version = "0.6.4"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "a2b8b99d4cdbf36b239a9532e31fe4fb8acc38d1897c1761e161550a7dc78e6a"
+ dependencies = [
+ "assert-json-diff",
+ "async-trait",
+ "base64",
+ "deadpool",
+ "futures",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "log",
+ "once_cell",
+ "regex",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+ ]
+
  [[package]]
  name = "wit-bindgen-rt"
  version = "0.33.0"
@@ -1,7 +1,7 @@
  [package]
  name = "datago"
  edition = "2021"
- version = "2025.8.1"
+ version = "2025.12.1"
  readme = "README.md"
 
  [lib]
@@ -24,7 +24,7 @@ kanal = "0.1"
  clap = { version = "4.5.27", features = ["derive"] }
  tokio = { version = "1.43.1", features = ["rt-multi-thread", "macros"] }
  prettytable-rs = "0.10.0"
- pyo3 = { version = "0.24.1", features = ["extension-module"] }
+ pyo3 = { version = "0.22", features = ["extension-module"] }
  threadpool = "1.8.1"
  openssl = { version = "0.10", features = ["vendored"] }
  walkdir = "2.5.0"
@@ -46,6 +46,7 @@ fast_image_resize = { version ="5.1.3", features=["image"]}
 
  [dev-dependencies]
  tempfile = "3.13.0"
+ wiremock = "0.6.0"
 
  [profile.release]
  opt-level = 3
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datago
- Version: 2025.8.1
+ Version: 2025.12.1
  Classifier: Programming Language :: Rust
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -8,7 +8,7 @@ Classifier: Programming Language :: Python :: 3
  Classifier: License :: OSI Approved :: MIT License
  License-File: LICENSE
  Summary: A high performance dataloader for Python, written in Rust
- Author: Benjamin Lefaudeux
+ Author: Benjamin Lefaudeux, Roman Frigg
  Author-email: Photoroom <team@photoroom.com>
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
@@ -97,7 +97,7 @@ config = {
  "source_config": {
  "root_path": "myPath",
  "random_sampling": False, # True if used directly for training
- "rank": 0,
+ "rank": 0, # Optional, distributed workloads are possible
  "world_size": 1,
  },
  "limit": 200,
@@ -137,15 +137,6 @@ client_config = {
  "rank": 0,
  "world_size": 1,
  },
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
- "image_config": {
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -159,6 +150,38 @@ for _ in range(10):
 
  </details>
 
+ ## Process images on the fly
+
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration, as in the example below.
+
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+ There are three main processing options to choose from:
+
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+ - resize the images (the setting here refers to the square aspect ratio bucket, other buckets will differ of course)
+ - pre-encode the images to a specific format (jpg, png, ...)
+
+ ```python
+ config = {
+ "source_type": "file",
+ "source_config": {
+ "root_path": "myPath",
+ "random_sampling": False, # True if used directly for training
+ },
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+ "image_config": {
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ },
+ "limit": 200,
+ "samples_buffer_size": 32,
+ }
+ ```
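Consuming the processed samples works the same as before; since images come back as PIL by default, a quick sketch (reusing the `config` just above) that tallies the aspect ratio buckets actually produced — the attribute set on a sample depends on the source, so treat this as illustrative:

```python
import json
from collections import Counter

from datago import DatagoClient  # type: ignore

client = DatagoClient(json.dumps(config))  # `config` from the example above
client.start()  # optional, but reduces latency to the first sample

buckets: Counter = Counter()
for _ in range(200):
    sample = client.get_sample()
    if sample.id:
        # PIL .size is (width, height); with crop_and_resize on, only a
        # handful of aspect ratio buckets should show up here
        buckets[sample.image.size] += 1

print(buckets.most_common())
```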
 
  ## Match the raw exported buffers with typical python types
 
@@ -171,6 +194,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
 
+ ## Env variables
+
+ A couple of env variables change the behavior of the library; they cover settings which felt too low level to be exposed in the config.
+
+ - `DATAGO_MAX_TASKS`: the number of threads used to load the samples. Defaults to a multiple of the CPU cores.
+ - `RUST_LOG`: see above; changes the log level for the whole library, useful for debugging or when reporting an issue here.
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load. Defaults to 3.
+
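These are plain process environment variables, so they can be set from the shell or from Python before the client is created — `benchmark_filesystem.py` below does exactly that for `DATAGO_MAX_TASKS`. A small sketch (the values are arbitrary):

```python
import os

# Set before instantiating DatagoClient so the Rust side picks them up
os.environ["DATAGO_MAX_TASKS"] = "16"   # cap the loader thread count
os.environ["DATAGO_MAX_RETRIES"] = "5"  # more tolerance for flaky storage
os.environ["RUST_LOG"] = "debug"        # verbose logging from the Rust side
```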
  </details><details> <summary><strong>Build it</strong></summary>
 
  ## Preamble
@@ -233,6 +264,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
  </details>
 
+ <details> <summary><strong>Benchmarks</strong></summary>
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is on the images and the higher quality they are, the more Datago will shine. The following benchmarks use ImageNet 1k, which is very low resolution and thus something of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+ ### AMD Zen3 laptop - IN1k - disk
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
+
+ ### AMD EPYC 9454 - IN1k - disk
+ ![AMD EPYC 9454](assets/epyc_vast.png)
+
+ The following benchmark uses the PD12M dataset, 12M images with many at high resolution. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago starts streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
+
+ ### AMD EPYC 9454 - pd12m - webdataset
+ ![AMD EPYC 9454](assets/epyc_wds.png)
+
+ </details>
+
  ## License
 
  MIT License
@@ -80,7 +80,7 @@ config = {
  "source_config": {
  "root_path": "myPath",
  "random_sampling": False, # True if used directly for training
- "rank": 0,
+ "rank": 0, # Optional, distributed workloads are possible
  "world_size": 1,
  },
  "limit": 200,
@@ -120,15 +120,6 @@ client_config = {
  "rank": 0,
  "world_size": 1,
  },
- # Optional pre-processing of the images, placing them in an aspect ratio bucket to preseve as much as possible of the original content
- "image_config": {
- "crop_and_resize": True, # False to turn it off, or just omit this part of the config
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": 1_000_000, # Dummy example, max number of samples you would like to serve
@@ -142,6 +133,38 @@ for _ in range(10):
 
  </details>
 
+ ## Process images on the fly
+
+ Datago can also process images on the fly, for instance to align different image payloads. This is done by adding an `image_config` to the configuration, as in the example below.
+
+ Processing can be very CPU heavy, but it will be distributed over all CPU cores without requiring multiple python processes. I.e., you can keep a single python process using `get_sample()` on the client and still saturate all CPU cores.
+
+ There are three main processing options to choose from:
+
+ - crop the images to within an aspect ratio bucket (which is very handy for all Transformer / patch based architectures)
+ - resize the images (the setting here refers to the square aspect ratio bucket, other buckets will differ of course)
+ - pre-encode the images to a specific format (jpg, png, ...)
+
+ ```python
+ config = {
+ "source_type": "file",
+ "source_config": {
+ "root_path": "myPath",
+ "random_sampling": False, # True if used directly for training
+ },
+ # Optional pre-processing of the images, placing them in an aspect ratio bucket to preserve as much as possible of the original content
+ "image_config": {
+ "crop_and_resize": True, # False to turn it off, or just omit this part of the config
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ },
+ "limit": 200,
+ "samples_buffer_size": 32,
+ }
+ ```
 
  ## Match the raw exported buffers with typical python types
 
@@ -154,6 +177,14 @@ You can set the log level using the RUST_LOG environment variable. E.g. `RUST_LO
 
  When using the library from Python, `env_logger` will be initialized automatically when creating a `DatagoClient`. There is also an `initialize_logging` function in the `datago` module which, if called before using a client, allows customizing the log level. This only works if RUST_LOG is not set.
 
+ ## Env variables
+
+ A couple of env variables change the behavior of the library; they cover settings which felt too low level to be exposed in the config.
+
+ - `DATAGO_MAX_TASKS`: the number of threads used to load the samples. Defaults to a multiple of the CPU cores.
+ - `RUST_LOG`: see above; changes the log level for the whole library, useful for debugging or when reporting an issue here.
+ - `DATAGO_MAX_RETRIES`: number of retries for a failed sample load. Defaults to 3.
+
  </details><details> <summary><strong>Build it</strong></summary>
 
  ## Preamble
@@ -216,6 +247,25 @@ Create a new tag and a new release in this repo, a new package will be pushed au
 
  </details>
 
+ <details> <summary><strong>Benchmarks</strong></summary>
+ As usual, benchmarks are a tricky game, and you shouldn't read too much into the following plots but do your own tests. Some python benchmark examples are provided in the [python](./python/) folder.
+
+ In general, Datago will be impactful if you want to load a lot of images very fast, but if you consume them as you go at a more leisurely pace then it's not really needed. The more CPU work there is on the images and the higher quality they are, the more Datago will shine. The following benchmarks use ImageNet 1k, which is very low resolution and thus something of a worst case scenario. Data is served from cache (i.e. the OS cache) and the images are not pre-processed. In this case the receiving python process is typically the bottleneck, and caps at around 2000 images per second.
+
+ ### AMD Zen3 laptop - IN1k - disk
+ ![AMD Zen3 laptop & M2 SSD](assets/zen3_ssd.png)
+
+ ### AMD EPYC 9454 - IN1k - disk
+ ![AMD EPYC 9454](assets/epyc_vast.png)
+
+ The following benchmark uses the PD12M dataset, 12M images with many at high resolution. It's accessed through the webdataset front end, and datago is compared with the popular python webdataset library. Note that datago starts streaming the images faster here (almost instantly!), so given enough time the two results would look closer.
+
+ ### AMD EPYC 9454 - pd12m - webdataset
+ ![AMD EPYC 9454](assets/epyc_wds.png)
+
+ </details>
+
  ## License
 
  MIT License
Binary files not shown (new assets: assets/epyc_vast.png, assets/epyc_wds.png, assets/zen3_ssd.png)
@@ -2,7 +2,7 @@
  name = "datago"
  dynamic = ["version"]
  authors = [
- { name = "Benjamin Lefaudeux" },
+ { name = "Benjamin Lefaudeux, Roman Frigg" },
  { name = "Photoroom", email = "team@photoroom.com" }
  ]
  description = "A high performance dataloader for Python, written in Rust"
@@ -1,11 +1,13 @@
- from datago import DatagoClient # type: ignore
+ import json
  import time
- from tqdm import tqdm
+
  import numpy as np
- from raw_types import raw_array_to_pil_image, raw_array_to_numpy
  import typer
- import json
+ from benchmark_defaults import IMAGE_CONFIG
+ from datago import DatagoClient # type: ignore
  from PIL import Image
+ from raw_types import raw_array_to_numpy
+ from tqdm import tqdm
 
 
  def benchmark(
@@ -31,19 +33,20 @@ def benchmark(
  "rank": 0,
  "world_size": 1,
  },
- "image_config": {
- "crop_and_resize": crop_and_resize,
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": encode_images,
- },
  "prefetch_buffer_size": 128,
  "samples_buffer_size": 64,
  "limit": limit,
  }
 
+ if crop_and_resize or encode_images:
+ client_config["image_config"] = IMAGE_CONFIG
+
+ if encode_images:
+ client_config["image_config"]["crop_and_resize"] = ( # type: ignore
+ crop_and_resize # You may want to encode images without resizing them
+ )
+ client_config["image_config"]["pre_encode_images"] = True # type: ignore
+
  client = DatagoClient(json.dumps(client_config))
  client.start() # Optional, but good practice to start the client to reduce latency to first sample (while you're instantiating models for instance)
  start = time.time()
@@ -55,21 +58,7 @@ def benchmark(
  for _ in tqdm(range(limit), dynamic_ncols=True):
  sample = client.get_sample()
  if sample.id:
- # Bring the masks and image to PIL
- if hasattr(sample, "image"):
- img = raw_array_to_pil_image(sample.image)
-
- if hasattr(sample, "masks"):
- for _, mask_buffer in sample.masks.items():
- mask = raw_array_to_pil_image(mask_buffer)
-
- if (
- hasattr(sample, "additional_images")
- and "masked_image" in sample.additional_images
- ):
- masked_image = raw_array_to_pil_image(
- sample.AdditionalImages["masked_image"]
- )
+ # Images are already PIL by default
 
  # Bring the latents to numpy
  if hasattr(sample, "latents"):
@@ -0,0 +1,8 @@
+ IMAGE_CONFIG = {
+ "crop_and_resize": True,
+ "default_image_size": 1024,
+ "downsampling_ratio": 32,
+ "min_aspect_ratio": 0.5,
+ "max_aspect_ratio": 2.0,
+ "pre_encode_images": False,
+ }
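Note that `IMAGE_CONFIG` is a shared module-level dict, which `benchmark_db.py` mutates in place for the pre-encoding case; if you borrow this pattern and want per-run tweaks without touching the shared defaults, a copy-based override is safer — a sketch, not part of the package:

```python
from benchmark_defaults import IMAGE_CONFIG

client_config = {
    "source_type": "file",
    "source_config": {"root_path": "myPath", "random_sampling": False},
}
# Copy before overriding so other importers keep the pristine defaults
client_config["image_config"] = {**IMAGE_CONFIG, "pre_encode_images": True}
```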
@@ -1,8 +1,11 @@
- import time
- from tqdm import tqdm
+ import json
  import os
+ import time
+
  import typer
+ from benchmark_defaults import IMAGE_CONFIG
  from dataset import DatagoIterDataset
+ from tqdm import tqdm
 
 
  def benchmark(
@@ -14,12 +17,30 @@ def benchmark(
  False, help="Crop and resize the images on the fly"
  ),
  compare_torch: bool = typer.Option(True, help="Compare against torch dataloader"),
+ num_workers: int = typer.Option(os.cpu_count(), help="Number of workers to use"),
+ sweep: bool = typer.Option(False, help="Sweep over the number of workers"),
  ):
- print(f"Running benchmark for {root_path} - {limit} samples")
+ if sweep:
+ results_sweep = {}
+ for num_workers in range(2, (os.cpu_count() * 2 or 2), 2):
+ results_sweep[num_workers] = benchmark(
+ root_path, limit, crop_and_resize, compare_torch, num_workers, False
+ )
+
+ # Save results to a json file
+
+ with open("benchmark_results_filesystem.json", "w") as f:
+ json.dump(results_sweep, f, indent=2)
+
+ return results_sweep
+
  print(
- "Please run the benchmark twice if you want to compare against torch dataloader, so that file caching affects both paths"
+ f"Running benchmark for {root_path} - {limit} samples - {num_workers} workers"
  )
 
+ # This setting is not exposed in the config, but an env variable can be used instead
+ os.environ["DATAGO_MAX_TASKS"] = str(num_workers)
+
  client_config = {
  "source_type": "file",
  "source_config": {
@@ -27,19 +48,14 @@ def benchmark(
  "rank": 0,
  "world_size": 1,
  },
- "image_config": {
- "crop_and_resize": crop_and_resize,
- "default_image_size": 1024,
- "downsampling_ratio": 32,
- "min_aspect_ratio": 0.5,
- "max_aspect_ratio": 2.0,
- "pre_encode_images": False,
- },
  "prefetch_buffer_size": 256,
  "samples_buffer_size": 256,
  "limit": limit,
  }
 
+ if crop_and_resize:
+ client_config["image_config"] = IMAGE_CONFIG
+
  # Make sure in the following that we compare apples to apples, meaning in that case
  # that we materialize the payloads in the python scope in the expected format
  # (PIL.Image for images and masks for instance, numpy arrays for latents)
@@ -48,14 +64,20 @@ def benchmark(
 
  img = None
  count = 0
- for sample in tqdm(datago_dataset, dynamic_ncols=True):
+ for sample in tqdm(datago_dataset, desc="Datago", dynamic_ncols=True):
  assert sample["id"] != ""
  img = sample["image"]
+
+ if count < limit - 1:
+ del img
+ img = None # Help with memory pressure
+
  count += 1
 
  assert count == limit, f"Expected {limit} samples, got {count}"
  fps = limit / (time.time() - start)
- print(f"Datago FPS {fps:.2f}")
+ results = {"datago": {"fps": fps, "count": count}}
+ print(f"Datago - FPS {fps:.2f} - workers {num_workers}")
  del datago_dataset
 
  # Save the last image as a test
@@ -64,10 +86,9 @@ def benchmark(
 
  # Let's compare against a classic pytorch dataloader
  if compare_torch:
- from torchvision import datasets, transforms # type: ignore
  from torch.utils.data import DataLoader
+ from torchvision import datasets, transforms # type: ignore
 
- print("Benchmarking torch dataloader")
  # Define the transformations to apply to each image
  transform = (
  transforms.Compose(
@@ -88,7 +109,6 @@ def benchmark(
 
  # Create a DataLoader to allow for multiple workers
  # Use available CPU count for num_workers
- num_workers = os.cpu_count() or 8 # Default to 8 if cpu_count returns None
  dataloader = DataLoader(
  dataset,
  batch_size=1,
@@ -100,12 +120,17 @@ def benchmark(
  # Iterate over the DataLoader
  start = time.time()
  n_images = 0
- for batch in tqdm(dataloader, dynamic_ncols=True):
+ for batch in tqdm(dataloader, desc="Torch", dynamic_ncols=True):
  n_images += len(batch)
  if n_images > limit:
  break
+
+ del batch # Help with memory pressure, same as above
  fps = n_images / (time.time() - start)
- print(f"Torch FPS {fps:.2f}")
+ results["torch"] = {"fps": fps, "count": n_images}
+ print(f"Torch - FPS {fps:.2f} - workers {num_workers}")
+
+ return results
 
 
  if __name__ == "__main__":
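The sweep dumps one entry per worker count into `benchmark_results_filesystem.json`, with `datago` (and, when `compare_torch` is on, `torch`) FPS figures. A throwaway sketch for plotting the sweep afterwards — matplotlib is assumed here and is not a dependency of this package:

```python
import json

import matplotlib.pyplot as plt

with open("benchmark_results_filesystem.json") as f:
    results = json.load(f)  # {"2": {"datago": {...}, "torch": {...}}, ...}

workers = sorted(results, key=int)  # JSON stringified the int keys
xs = [int(w) for w in workers]
plt.plot(xs, [results[w]["datago"]["fps"] for w in workers], label="datago")
if all("torch" in results[w] for w in workers):  # only present with compare_torch
    plt.plot(xs, [results[w]["torch"]["fps"] for w in workers], label="torch")
plt.xlabel("workers")
plt.ylabel("images / s")
plt.legend()
plt.show()
```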