datago 2025.6.5.tar.gz → 2025.8.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {datago-2025.6.5 → datago-2025.8.1}/Cargo.lock +1 -1
  2. {datago-2025.6.5 → datago-2025.8.1}/Cargo.toml +2 -1
  3. {datago-2025.6.5 → datago-2025.8.1}/PKG-INFO +2 -1
  4. {datago-2025.6.5 → datago-2025.8.1}/pyproject.toml +8 -6
  5. {datago-2025.6.5 → datago-2025.8.1}/python/test_datago_edge_cases.py +3 -4
  6. {datago-2025.6.5 → datago-2025.8.1}/src/client.rs +73 -9
  7. {datago-2025.6.5 → datago-2025.8.1}/src/generator_files.rs +105 -64
  8. {datago-2025.6.5 → datago-2025.8.1}/src/generator_http.rs +8 -9
  9. {datago-2025.6.5 → datago-2025.8.1}/src/generator_wds.rs +20 -29
  10. {datago-2025.6.5 → datago-2025.8.1}/src/image_processing.rs +8 -16
  11. {datago-2025.6.5 → datago-2025.8.1}/src/main.rs +5 -8
  12. {datago-2025.6.5 → datago-2025.8.1}/src/structs.rs +1 -0
  13. {datago-2025.6.5 → datago-2025.8.1}/src/worker_files.rs +7 -8
  14. {datago-2025.6.5 → datago-2025.8.1}/src/worker_http.rs +12 -16
  15. {datago-2025.6.5 → datago-2025.8.1}/src/worker_wds.rs +7 -10
  16. {datago-2025.6.5 → datago-2025.8.1}/.github/workflows/ci-cd.yml +0 -0
  17. {datago-2025.6.5 → datago-2025.8.1}/.github/workflows/rust.yml +0 -0
  18. {datago-2025.6.5 → datago-2025.8.1}/.gitignore +0 -0
  19. {datago-2025.6.5 → datago-2025.8.1}/.pre-commit-config.yaml +0 -0
  20. {datago-2025.6.5 → datago-2025.8.1}/LICENSE +0 -0
  21. {datago-2025.6.5 → datago-2025.8.1}/README.md +0 -0
  22. {datago-2025.6.5 → datago-2025.8.1}/assets/447175851-2277afcb-8abf-4d17-b2db-dae27c6056d0.png +0 -0
  23. {datago-2025.6.5 → datago-2025.8.1}/python/benchmark_db.py +0 -0
  24. {datago-2025.6.5 → datago-2025.8.1}/python/benchmark_filesystem.py +0 -0
  25. {datago-2025.6.5 → datago-2025.8.1}/python/benchmark_webdataset.py +0 -0
  26. {datago-2025.6.5 → datago-2025.8.1}/python/dataset.py +0 -0
  27. {datago-2025.6.5 → datago-2025.8.1}/python/raw_types.py +0 -0
  28. {datago-2025.6.5 → datago-2025.8.1}/python/test_datago_client.py +0 -0
  29. {datago-2025.6.5 → datago-2025.8.1}/python/test_datago_db.py +0 -0
  30. {datago-2025.6.5 → datago-2025.8.1}/python/test_datago_filesystem.py +0 -0
  31. {datago-2025.6.5 → datago-2025.8.1}/requirements-tests.txt +0 -0
  32. {datago-2025.6.5 → datago-2025.8.1}/requirements.txt +0 -0
  33. {datago-2025.6.5 → datago-2025.8.1}/src/lib.rs +0 -0
{datago-2025.6.5 → datago-2025.8.1}/Cargo.lock
@@ -613,7 +613,7 @@ dependencies = [
 
 [[package]]
 name = "datago"
-version = "2025.6.5"
+version = "2025.8.1"
 dependencies = [
  "async-compression",
  "async-tar",
{datago-2025.6.5 → datago-2025.8.1}/Cargo.toml
@@ -1,7 +1,8 @@
 [package]
 name = "datago"
 edition = "2021"
-version = "2025.6.5"
+version = "2025.8.1"
+readme = "README.md"
 
 [lib]
 # exposed by pyo3
{datago-2025.6.5 → datago-2025.8.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datago
-Version: 2025.6.5
+Version: 2025.8.1
 Classifier: Programming Language :: Rust
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
 License-File: LICENSE
 Summary: A high performance dataloader for Python, written in Rust
+Author: Benjamin Lefaudeux
 Author-email: Photoroom <team@photoroom.com>
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
{datago-2025.6.5 → datago-2025.8.1}/pyproject.toml
@@ -1,17 +1,19 @@
 [project]
 name = "datago"
+dynamic = ["version"]
 authors = [
-  { name="Photoroom", email="team@photoroom.com" },
+  { name = "Benjamin Lefaudeux" },
+  { name = "Photoroom", email = "team@photoroom.com" }
 ]
 description = "A high performance dataloader for Python, written in Rust"
 readme = "README.md"
 requires-python = ">=3.8"
 classifiers = [
-  "Programming Language :: Rust",
-  "Programming Language :: Python :: Implementation :: CPython",
-  "Programming Language :: Python :: Implementation :: PyPy",
-  "Programming Language :: Python :: 3",
-  "License :: OSI Approved :: MIT License",
+    "Programming Language :: Rust",
+    "Programming Language :: Python :: Implementation :: CPython",
+    "Programming Language :: Python :: Implementation :: PyPy",
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
 ]
 dependencies = []
 
{datago-2025.6.5 → datago-2025.8.1}/python/test_datago_edge_cases.py
@@ -238,11 +238,10 @@ class TestDatagoEdgeCases:
             "samples_buffer_size": 10,
         }
 
+        # Should flag that the config is not correct
         client = DatagoClient(json.dumps(config))
-        _sample = client.get_sample()
-
-        # Should handle gracefully (might return None or work with adjusted parameters)
-        # The exact behavior depends on implementation
+        sample = client.get_sample()
+        assert sample is None
 
    def test_very_large_buffer_sizes(self):
        """Test with very large buffer sizes."""
{datago-2025.6.5 → datago-2025.8.1}/src/client.rs
@@ -26,14 +26,58 @@ pub struct DatagoClient {
 
     // Holds all the variables related to a running engine
     engine: Option<DatagoEngine>,
+
+    is_valid: bool,
+}
+
+fn check_config(str_config: &str) -> Option<DatagoClientConfig> {
+    match serde_json::from_str::<DatagoClientConfig>(str_config) {
+        Ok(config) => {
+            if config.samples_buffer_size == 0 {
+                error!("Samples buffer size must be greater than 0");
+                return None;
+            }
+
+            if config.limit == 0 {
+                error!("Limit must be greater than 0");
+                return None;
+            }
+
+            // Check that a distributed config is valid, and error out early if not
+            let world_size = config
+                .source_config
+                .get("world_size")
+                .and_then(|v| v.as_u64())
+                .unwrap_or(1) as usize;
+            let rank = config
+                .source_config
+                .get("rank")
+                .and_then(|v| v.as_u64())
+                .unwrap_or(0) as usize;
+            if world_size == 0 {
+                error!("World size must be greater than 0");
+                return None;
+            }
+
+            if rank >= world_size {
+                error!("Rank must be less than world size");
+                return None;
+            }
+            Some(config)
+        }
+        Err(e) => {
+            error!("Failed to parse config: {e}");
+            None
+        }
+    }
 }
 
 #[pymethods]
 impl DatagoClient {
     #[new]
     pub fn new(str_config: String) -> Self {
-        match serde_json::from_str::<DatagoClientConfig>(&str_config) {
-            Ok(config) => {
+        match check_config(&str_config) {
+            Some(config) => {
                 let mut image_transform: Option<ARAwareTransform> = None;
                 let mut encode_images = false;
                 let mut image_to_rgb8 = false;
@@ -45,8 +89,6 @@ impl DatagoClient {
                     image_to_rgb8 = image_config.image_to_rgb8;
                 }
 
-                assert!(config.limit > 0, "Limit must be greater than 0");
-
                 DatagoClient {
                     is_started: false,
                     source_type: config.source_type,
@@ -58,10 +100,24 @@ impl DatagoClient {
                     encode_images,
                     image_to_rgb8,
                     engine: None,
+                    is_valid: true,
                 }
             }
-            Err(e) => {
-                panic!("Failed to parse config: {}", e);
+            None => {
+                error!("Failed to parse config");
+                DatagoClient {
+                    is_started: false,
+                    source_type: SourceType::Invalid,
+                    source_config: serde_json::Value::Null,
+                    samples_buffer: 0,
+                    limit: 0,
+                    max_connections: 0,
+                    image_transform: None,
+                    encode_images: false,
+                    image_to_rgb8: false,
+                    engine: None,
+                    is_valid: false,
+                }
             }
         }
    }
@@ -87,12 +143,20 @@ impl DatagoClient {
                 warn!("WebDataset source type is new and experimental, use with caution!\nPlease report any issues you encounter to https://github.com/Photoroom/datago/issues.");
                 self.engine = Some(generator_wds::orchestrate(self));
             }
+            SourceType::Invalid => {
+                error!("Client ill-defined, probably a config error. Cannot start");
+                return;
+            }
         }
 
         self.is_started = true;
     }
 
     pub fn get_sample(&mut self) -> Option<Sample> {
+        if !self.is_valid {
+            return None;
+        }
+
         if !self.is_started {
             self.start();
         }
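For orientation, a minimal caller-side sketch (not part of the diff) of the new non-panicking path. The JSON keys shown are the ones read by check_config above; the rest of the config schema and its defaults are assumed.

    // Hypothetical usage sketch: a distributed config with rank >= world_size
    // no longer panics at construction; the client is marked invalid and
    // get_sample() simply returns None.
    let config = serde_json::json!({
        "source_config": { "rank": 2, "world_size": 2 },
        "limit": 10,
        "samples_buffer_size": 10
    });
    let mut client = DatagoClient::new(config.to_string());
    assert!(client.get_sample().is_none());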
@@ -120,7 +184,7 @@ impl DatagoClient {
                 }
             },
             Err(e) => {
-                warn!("Timeout waiting for sample, stopping the client. {}", e);
+                warn!("Timeout waiting for sample, stopping the client. {e}");
                 self.stop();
                 None
             }
@@ -514,7 +578,7 @@ mod tests {
         let mut config = get_test_config();
         let tag1 = "photo";
         let tag2 = "graphic";
-        config["source_config"]["tags_ne_all"] = format!("{},{}", tag1, tag2).into();
+        config["source_config"]["tags_ne_all"] = format!("{tag1},{tag2}").into();
         let mut client = DatagoClient::new(config.to_string());
 
         let sample = client.get_sample();
@@ -599,7 +663,7 @@ mod tests {
         config["source_config"]["sources_ne"] = "LAION_ART".into();
         config["limit"] = json!(limit);
 
-        debug!("{}", config);
+        debug!("{config}");
         let mut client = DatagoClient::new(config.to_string());
 
        for _ in 0..limit {
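Most of the remaining mechanical edits in this diff follow the pattern of the hunks above: positional format arguments are replaced by inline captured identifiers, a form Rust has supported since 1.58 (presumably prompted by clippy's uninlined_format_args lint). The rendered output is identical, as a quick sketch shows:

    // Both forms render the same string; only the call-site syntax changes.
    let e = "channel closed";
    assert_eq!(
        format!("Timeout waiting for sample, stopping the client. {}", e),
        format!("Timeout waiting for sample, stopping the client. {e}")
    );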
{datago-2025.6.5 → datago-2025.8.1}/src/generator_files.rs
@@ -5,8 +5,6 @@ use kanal::bounded;
 use log::{debug, info};
 use rand::seq::SliceRandom;
 use serde::{Deserialize, Serialize};
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
 use std::thread;
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -23,17 +21,24 @@ pub struct SourceFileConfig {
     pub world_size: usize,
 }
 
-// Hash function to be able to dispatch the samples to the correct rank
+fn get_data_slice_multirank(quorum: usize, rank: usize, world_size: usize) -> (usize, usize) {
+    assert!(rank < world_size, "Rank must be less than world size");
 
-// The seed ensures consistent hashing across different runs,
-// essentially acting as a deterministic salt
-const HASH_SEED: u64 = 0x51_73_b3_c3_7f_d9_2e_a1;
+    let chunk_size = quorum / world_size; // This floors by default
+    let remainder = quorum % world_size;
 
-fn hash<T: Hash>(t: &T) -> u64 {
-    let mut hasher = DefaultHasher::new();
-    HASH_SEED.hash(&mut hasher); // Add seed first
-    t.hash(&mut hasher); // Then hash the actual data
-    hasher.finish()
+    let start = if rank < remainder {
+        rank * (chunk_size + 1)
+    } else {
+        remainder * (chunk_size + 1) + (rank - remainder) * chunk_size
+    };
+
+    let end = if (rank + 1) <= remainder {
+        (rank + 1) * (chunk_size + 1)
+    } else {
+        remainder * (chunk_size + 1) + (rank + 1 - remainder) * chunk_size
+    };
+    (start, end)
 }
 
 fn enumerate_files(
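As a quick sanity check of the remainder handling (mirroring the new unit tests further down), a minimal sketch of how 13 entries are split across 3 ranks with the helper added above:

    // Ranks below the remainder (13 % 3 = 1) get one extra entry.
    for rank in 0..3 {
        let (start, end) = get_data_slice_multirank(13, rank, 3);
        println!("rank {rank} -> files[{start}..{end}]"); // 0..5, 5..9, 9..13
    }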
@@ -49,6 +54,7 @@ fn enumerate_files(
         .into_iter()
         .filter_map(|e| e.ok());
 
+    // We need to materialize the file list to be able to shuffle it
     let mut files_list: Vec<walkdir::DirEntry> = files
         .filter_map(|entry| {
             let path = entry.path();
@@ -65,13 +71,18 @@ fn enumerate_files(
         .collect();
 
     // If shuffle is set, shuffle the files
-    let files_iter = if source_config.random_sampling {
+    if source_config.random_sampling {
         let mut rng = rand::rng(); // Get a random number generator, thread local. We don´t seed, so typically won't be reproducible
-        files_list.shuffle(&mut rng);
-        files_list.into_iter()
-    } else {
-        files_list.into_iter()
-    };
+        files_list.shuffle(&mut rng); // This happens in place
+    }
+
+    // If world_size > 1, we need to split the files list into chunks and only process the chunk corresponding to the rank
+    if source_config.world_size > 1 {
+        let quorum = files_list.len();
+        let (start, end) =
+            get_data_slice_multirank(quorum, source_config.rank, source_config.world_size);
+        files_list = files_list[start..end].to_vec();
+    }
 
     // Iterate over the files and send the paths as they come
     let mut count = 0;
@@ -81,17 +92,8 @@ fn enumerate_files(
     let max_submitted_samples = (1.1 * (limit as f64)).ceil() as usize;
 
     // Build a page from the files iterator
-    for entry in files_iter {
-        let file_name = entry.path().to_str().unwrap().to_string();
-
-        // If world_size is not 0, we need to dispatch the samples to the correct rank
-        if source_config.world_size > 1 {
-            let hash = hash(&file_name);
-            let target_rank = (hash % source_config.world_size as u64) as usize;
-            if target_rank != source_config.rank {
-                continue;
-            }
-        }
+    for entry in files_list.iter() {
+        let file_name: String = entry.path().to_str().unwrap().to_string();
 
         if samples_metadata_tx
             .send(serde_json::Value::String(file_name))
@@ -110,10 +112,7 @@ fn enumerate_files(
     }
 
     // Either we don't have any more samples or we have reached the limit
-    debug!(
-        "ping_pages: total samples requested: {}. page samples served {}",
-        limit, count
-    );
+    debug!("ping_pages: total samples requested: {limit}. page samples served {count}");
 
     // Send an empty value to signal the end of the stream
     match samples_metadata_tx.send(serde_json::Value::Null) {
@@ -183,19 +182,55 @@ mod tests {
     use tempfile::TempDir;
 
     #[test]
-    fn test_hash_function() {
-        let str1 = "test_string1";
-        let str2 = "test_string2";
-        let str3 = "test_string1"; // Same as str1
-
-        let hash1 = hash(&str1);
-        let hash2 = hash(&str2);
-        let hash3 = hash(&str3);
-
-        // Same input should produce same hash
-        assert_eq!(hash1, hash3);
-        // Different inputs should likely produce different hashes
-        assert_ne!(hash1, hash2);
+    fn test_get_data_slice_multirank() {
+        // Test case 1: Equal distribution with no remainder
+        let (start, end) = get_data_slice_multirank(10, 0, 2);
+        assert_eq!(start, 0);
+        assert_eq!(end, 5);
+
+        let (start, end) = get_data_slice_multirank(10, 1, 2);
+        assert_eq!(start, 5);
+        assert_eq!(end, 10);
+
+        // Test case 2: Unequal distribution with remainder
+        let (start, end) = get_data_slice_multirank(11, 0, 2);
+        assert_eq!(start, 0);
+        assert_eq!(end, 6);
+
+        let (start, end) = get_data_slice_multirank(11, 1, 2);
+        assert_eq!(start, 6);
+        assert_eq!(end, 11);
+
+        // Test case 3: Multiple ranks with remainder
+        let (start, end) = get_data_slice_multirank(13, 0, 3);
+        assert_eq!(start, 0);
+        assert_eq!(end, 5);
+
+        let (start, end) = get_data_slice_multirank(13, 1, 3);
+        assert_eq!(start, 5);
+        assert_eq!(end, 9);
+
+        let (start, end) = get_data_slice_multirank(13, 2, 3);
+        assert_eq!(start, 9);
+        assert_eq!(end, 13);
+
+        // Test case 4: Single rank
+        let (start, end) = get_data_slice_multirank(10, 0, 1);
+        assert_eq!(start, 0);
+        assert_eq!(end, 10);
+
+        // Test case 5: Edge case with zero quorum
+        let (start, end) = get_data_slice_multirank(0, 0, 1);
+        assert_eq!(start, 0);
+        assert_eq!(end, 0);
+
+        // Test case 6: Edge case with zero world size (should panic or handle gracefully)
+        // Note: This test assumes the function should panic or handle the zero division gracefully
+        // You may need to adjust the test based on your actual error handling
+        let result = std::panic::catch_unwind(|| {
+            get_data_slice_multirank(10, 0, 0);
+        });
+        assert!(result.is_err());
     }
 
     #[test]
@@ -227,14 +262,19 @@ mod tests {
         assert_eq!(config.world_size, 4);
     }
 
-    fn create_test_images(dir: &Path) -> Vec<String> {
+    fn create_test_images(dir: &Path, min_num_files: usize) -> Vec<String> {
         let extensions = ["jpg", "png", "bmp", "gif", "JPEG"];
         let mut files = Vec::new();
-        for (i, ext) in extensions.iter().enumerate() {
-            let filename = format!("test_image_{}.{}", i, ext);
-            let filepath = dir.join(&filename);
-            fs::write(&filepath, "fake_image_data").unwrap();
-            files.push(filepath.to_string_lossy().to_string());
+        let mut n_files = 0;
+
+        while n_files < min_num_files {
+            for (i, ext) in extensions.iter().enumerate() {
+                let filename = format!("test_image_{n_files}_{i}.{ext}");
+                let filepath = dir.join(&filename);
+                fs::write(&filepath, "fake_image_data").unwrap();
+                files.push(filepath.to_string_lossy().to_string());
+                n_files += 1;
+            }
         }
 
         // Create a non-image file that should be ignored
@@ -248,8 +288,8 @@ mod tests {
     fn test_enumerate_files_basic() {
         let temp_dir = TempDir::new().unwrap();
         let temp_path = temp_dir.path();
-
-        let created_files = create_test_images(temp_path);
+        let limit = 10;
+        let created_files = create_test_images(temp_path, limit);
 
         let (tx, rx) = kanal::bounded(100);
         let config = SourceFileConfig {
@@ -284,8 +324,8 @@ mod tests {
     fn test_enumerate_files_with_limit() {
         let temp_dir = TempDir::new().unwrap();
         let temp_path = temp_dir.path();
-
-        create_test_images(temp_path);
+        let limit = 10;
+        create_test_images(temp_path, limit);
 
         let (tx, rx) = kanal::bounded(100);
         let config = SourceFileConfig {
@@ -295,7 +335,6 @@ mod tests {
             world_size: 1,
         };
 
-        let limit = 2;
         std::thread::spawn(move || {
             enumerate_files(tx, config, limit);
         });
@@ -318,8 +357,9 @@ mod tests {
     fn test_enumerate_files_with_world_size() {
         let temp_dir = TempDir::new().unwrap();
         let temp_path = temp_dir.path();
+        let limit = 10;
 
-        create_test_images(temp_path);
+        create_test_images(temp_path, limit * 2); // We'll check that each rank has "limit" files
 
         // Test rank 0 of world_size 2
         let (tx1, rx1) = kanal::bounded(100);
@@ -340,11 +380,11 @@ mod tests {
         };
 
         std::thread::spawn(move || {
-            enumerate_files(tx1, config1, 10);
+            enumerate_files(tx1, config1, limit);
         });
 
         std::thread::spawn(move || {
-            enumerate_files(tx2, config2, 10);
+            enumerate_files(tx2, config2, limit);
         });
 
         let mut files_rank0 = Vec::new();
@@ -373,16 +413,17 @@ mod tests {
         }
 
         // Both ranks should have some files
-        assert!(!files_rank0.is_empty());
-        assert!(!files_rank1.is_empty());
+        assert!(files_rank0.len() >= limit);
+        assert!(files_rank1.len() >= limit);
     }
 
     #[test]
     fn test_enumerate_files_random_sampling() {
         let temp_dir = TempDir::new().unwrap();
         let temp_path = temp_dir.path();
+        let limit = 10;
 
-        create_test_images(temp_path);
+        create_test_images(temp_path, limit);
 
         // Run twice with random sampling to see if order changes
         let (tx1, rx1) = kanal::bounded(100);
@@ -402,11 +443,11 @@ mod tests {
         };
 
         std::thread::spawn(move || {
-            enumerate_files(tx1, config1, 10);
+            enumerate_files(tx1, config1, limit);
         });
 
         std::thread::spawn(move || {
-            enumerate_files(tx2, config2, 10);
+            enumerate_files(tx2, config2, limit);
         });
 
        let mut files1 = Vec::new();
{datago-2025.6.5 → datago-2025.8.1}/src/generator_http.rs
@@ -130,9 +130,9 @@ struct DbRequest {
 impl DbRequest {
     async fn get_http_request(&self, api_url: &str, api_key: &str) -> reqwest::Request {
         let mut url = if self.random_sampling {
-            Url::parse(&format!("{}images/random/", api_url))
+            Url::parse(&format!("{api_url}images/random/"))
         } else {
-            Url::parse(&format!("{}images/", api_url))
+            Url::parse(&format!("{api_url}images/"))
         }
         .unwrap(); // Cannot survive without the URL, that's a panic
 
@@ -183,7 +183,7 @@ impl DbRequest {
         let mut req = reqwest::Request::new(reqwest::Method::GET, url);
         req.headers_mut().append(
             AUTHORIZATION,
-            HeaderValue::from_str(&format!("Token {}", api_key))
+            HeaderValue::from_str(&format!("Token {api_key}"))
                 .expect("Couldn't parse the provided API key"),
         );
 
@@ -276,7 +276,7 @@ fn build_request(source_config: SourceDBConfig) -> DbRequest {
         "Rank cannot be greater than or equal to world size"
     );
 
-    debug!("Fields: {}", fields);
+    debug!("Fields: {fields}");
     debug!(
         "Rank: {}, World size: {}",
         source_config.rank, source_config.world_size
@@ -363,7 +363,7 @@ async fn async_pull_and_dispatch_pages(
     let mut headers = HeaderMap::new();
     headers.insert(
         AUTHORIZATION,
-        HeaderValue::from_str(&format!("Token {}", api_key)).unwrap(),
+        HeaderValue::from_str(&format!("Token {api_key}")).unwrap(),
     );
 
     let db_request = build_request(source_config.clone());
@@ -380,7 +380,7 @@ async fn async_pull_and_dispatch_pages(
             if let Some(next) = response_json.get("next") {
                 next_url = next;
             } else {
-                debug!("No next URL in the response {:?}", response_json);
+                debug!("No next URL in the response {response_json:?}");
             }
         }
         Err(e) => {
@@ -420,7 +420,7 @@ async fn async_pull_and_dispatch_pages(
                 }
             }
             None => {
-                debug!("No results in the response: {:?}", response_json);
+                debug!("No results in the response: {response_json:?}");
             }
         }
 
@@ -455,8 +455,7 @@ async fn async_pull_and_dispatch_pages(
 
     // Either we don't have any more samples or we have reached the limit
     debug!(
-        "pull_and_dispatch_pages: total samples requested: {}. page samples served {}",
-        limit, count
+        "pull_and_dispatch_pages: total samples requested: {limit}. page samples served {count}"
     );
 
    // Send an empty value to signal the end of the stream
{datago-2025.6.5 → datago-2025.8.1}/src/generator_wds.rs
@@ -95,9 +95,7 @@ async fn pull_tarballs(
     // Convert the byte stream to an AsyncRead
     let byte_stream = response.bytes_stream();
     let stream_reader =
-        StreamReader::new(byte_stream.map(|res_bytes| {
-            res_bytes.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))
-        }));
+        StreamReader::new(byte_stream.map(|res_bytes| res_bytes.map_err(std::io::Error::other)));
 
     // Wrap in BufReader for the async Tar reader
    let buf_reader = BufReader::new(stream_reader);
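The std::io::Error::other shorthand used here (and in the similar clean-ups in image_processing.rs, worker_files.rs and worker_http.rs below) is the standard-library helper stabilized in Rust 1.74; it is equivalent to the longer ErrorKind::Other form it replaces, as a small sketch shows:

    // Equivalent constructions, before and after this diff:
    let verbose = std::io::Error::new(std::io::ErrorKind::Other, "Failed to fetch image bytes");
    let concise = std::io::Error::other("Failed to fetch image bytes");
    assert_eq!(verbose.kind(), concise.kind());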
@@ -107,18 +105,18 @@ async fn pull_tarballs(
 
     let mut entries = archive
         .entries()
-        .map_err(|e| format!("Failed to fetch TarballSample: {}", e))?; // This returns a stream
+        .map_err(|e| format!("Failed to fetch TarballSample: {e}"))?; // This returns a stream
 
     let mut current_sample_key: Option<String> = None;
     let mut current_files_for_sample = TarballSample::new(url.to_string());
 
     while let Some(entry_result) = entries.next().await {
         let mut entry =
-            entry_result.map_err(|e| format!("Failed to read TarballSample entry: {}", e))?;
+            entry_result.map_err(|e| format!("Failed to read TarballSample entry: {e}"))?;
 
         let header_path = entry
             .path()
-            .map_err(|e| format!("Error considering TarballSample content {}", e))?
+            .map_err(|e| format!("Error considering TarballSample content {e}"))?
             .into_owned();
         let filename = header_path.to_string_lossy().into_owned();
 
@@ -172,7 +170,7 @@ async fn pull_tarballs(
         entry
             .read_to_end(&mut buffer)
             .await
-            .map_err(|e| format!("Failed to read TarballSample {}", e))?; // Read the content of the current file
+            .map_err(|e| format!("Failed to read TarballSample {e}"))?; // Read the content of the current file
 
         current_files_for_sample.add(BinaryFile { filename, buffer });
         debug!(
@@ -191,10 +189,7 @@ async fn pull_tarballs(
         return Err("Channel closed".into());
     }
 
-    debug!(
-        "dispatch_shards (streaming): finished processing TarballSample {}",
-        url
-    );
+    debug!("dispatch_shards (streaming): finished processing TarballSample {url}");
     Ok(())
 }
 
@@ -221,10 +216,7 @@ async fn pull_tarballs_task(
             }
             Err(e) => {
                 attempt += 1;
-                debug!(
-                    "Error pulling TarballSample: {}. Attempt {}/{}",
-                    e, attempt, retries
-                );
+                debug!("Error pulling TarballSample: {e}. Attempt {attempt}/{retries}");
                 if samples_metadata_tx.is_closed() {
                     debug!(
                         "dispatch_shards: samples_metadata_tx channel closed, stopping retries."
@@ -235,8 +227,7 @@ async fn pull_tarballs_task(
         }
     }
     Err(format!(
-        "Failed to pull TarballSample after {} attempts",
-        retries
+        "Failed to pull TarballSample after {retries} attempts"
     ))
 }
 
@@ -266,19 +257,19 @@ async fn get_url_list(
     // Given the url, list all the available webdataset files
     let request = reqwest::Request::new(
         reqwest::Method::GET,
-        Url::parse(&config.url).map_err(|e| format!("Failed parsing url: {}", e))?,
+        Url::parse(&config.url).map_err(|e| format!("Failed parsing url: {e}"))?,
     );
 
     let response = shared_client
         .client
         .execute(request)
         .await
-        .map_err(|e| format!("Failed parsing reply: {}", e))?;
+        .map_err(|e| format!("Failed parsing reply: {e}"))?;
 
     let response_text = response
         .text()
         .await
-        .map_err(|e| format!("Failed parsing reply: {}", e))?;
+        .map_err(|e| format!("Failed parsing reply: {e}"))?;
     let response_json: serde_json::Value =
         serde_json::from_str(&response_text).unwrap_or(serde_json::Value::Null);
 
@@ -336,7 +327,7 @@ async fn tasks_from_shards(
             }
             Err(e) => {
                 // Logging as debug, could be that channels are closed
-                debug!("dispatch_shards: task returned error: {:?}", e);
+                debug!("dispatch_shards: task returned error: {e:?}");
                 join_error = Some(e);
                 break;
             }
@@ -361,7 +352,7 @@ async fn tasks_from_shards(
                 count += 1;
             }
             Err(e) => {
-                debug!("dispatch_shards: task returned error: {:?}", e);
+                debug!("dispatch_shards: task returned error: {e}");
                 // Note that we only keep the first error, which is probably the most relevant
                 if join_error.is_none() {
                     join_error = Some(e);
@@ -372,7 +363,7 @@ async fn tasks_from_shards(
 
     if join_error.is_some() {
         // If we had an error, we log it and return an error
-        warn!("dispatch_shards: one of the tasks failed: {:?}", join_error);
+        warn!("dispatch_shards: one of the tasks failed: {join_error:?}");
         return Err(join_error.unwrap().to_string());
     }
 
@@ -380,7 +371,7 @@ async fn tasks_from_shards(
     if count == 0 {
         warn!("No items found in the response");
     }
-    debug!("Served {} items from the bucket", count);
+    debug!("Served {count} items from the bucket");
 
     // Send an empty value to signal the end of the stream
     if samples_metadata_tx
@@ -393,7 +384,7 @@ async fn tasks_from_shards(
             Ok(response_json)
         }
         Err(e) => {
-            warn!("Failed to get URL list: {}", e);
+            warn!("Failed to get URL list: {e}");
            Err(e) // Return a JoinError with the error message
        }
    }
@@ -421,7 +412,7 @@ fn query_shards_and_dispatch(
             debug!("query_shards_and_dispatch: finished processing all shards");
         }
         Err(e) => {
-            debug!("query_shards_and_dispatch: ended with : {:?}", e);
+            debug!("query_shards_and_dispatch: ended with : {e:?}");
         }
     }
     });
@@ -542,7 +533,7 @@ mod tests {
             }
         }
 
-        debug!("Received {} items", count);
+        debug!("Received {count} items");
         let _ = samples_meta_rx.close();
         feeder.join().expect("Feeder thread panicked");
 
@@ -593,7 +584,7 @@ mod tests {
                 break;
             }
         }
-        info!("Received {} items", count);
+        info!("Received {count} items");
        assert!(count >= limit, "Not enough items found in the bucket");
        client.stop();
 
@@ -650,7 +641,7 @@ mod tests {
                 break;
             }
         }
-        info!("Received {} items", count);
+        info!("Received {count} items");
        client.stop();
 
        samples
{datago-2025.6.5 → datago-2025.8.1}/src/image_processing.rs
@@ -59,10 +59,7 @@ impl ImageTransformConfig {
             self.max_aspect_ratio,
         );
 
-        debug!(
-            "Cropping and resizing images. Target image sizes:\n{:?}\n",
-            target_image_sizes
-        );
+        debug!("Cropping and resizing images. Target image sizes:\n{target_image_sizes:?}\n");
 
         let mut aspect_ratio_to_size = std::collections::HashMap::new();
         for img_size in &target_image_sizes {
@@ -325,12 +322,7 @@ pub async fn image_to_payload(
             image.height(),
             image.color().into(),
         )
-        .map_err(|e| {
-            image::ImageError::IoError(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                e.to_string(),
-            ))
-        })?;
+        .map_err(std::io::Error::other)?;
 
         channels = -1; // Signal the fact that the image is encoded
     } else {
@@ -687,8 +679,8 @@ mod tests {
 
         // Fill with some test data
         let buffer = img.buffer_mut();
-        for i in 0..buffer.len() {
-            buffer[i] = (i % 256) as u8;
+        for (i, item) in buffer.iter_mut().enumerate() {
+            *item = (i % 256) as u8;
         }
 
         let dyn_img = image_to_dyn_image(&img);
@@ -704,8 +696,8 @@ mod tests {
         let mut img = Image::new(width, height, fr::PixelType::U8x4);
 
         let buffer = img.buffer_mut();
-        for i in 0..buffer.len() {
-            buffer[i] = ((i * 63) % 256) as u8;
+        for (i, item) in buffer.iter_mut().enumerate() {
+            *item = ((i * 63) % 256) as u8;
         }
 
         let dyn_img = image_to_dyn_image(&img);
@@ -721,8 +713,8 @@ mod tests {
         let mut img = Image::new(width, height, fr::PixelType::U8);
 
         let buffer = img.buffer_mut();
-        for i in 0..buffer.len() {
-            buffer[i] = (i % 256) as u8;
+        for (i, item) in buffer.iter_mut().enumerate() {
+            *item = (i % 256) as u8;
        }
 
        let dyn_img = image_to_dyn_image(&img);
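The test hunks above are the clippy-style rewrite of an indexed loop into iter_mut().enumerate(); both fill the buffer identically, as this small self-contained sketch (not from the diff) illustrates:

    // Equivalent buffer fills, before and after the rewrite:
    let mut a = vec![0u8; 8];
    for i in 0..a.len() {
        a[i] = (i % 256) as u8;
    }
    let mut b = vec![0u8; 8];
    for (i, item) in b.iter_mut().enumerate() {
        *item = (i % 256) as u8;
    }
    assert_eq!(a, b);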
{datago-2025.6.5 → datago-2025.8.1}/src/main.rs
@@ -119,7 +119,7 @@ fn main() {
         "samples_buffer_size": samples_buffer_size
     });
 
-    info!("{}", config);
+    info!("{config}");
 
     let mut client = client::DatagoClient::new(config.to_string());
 
@@ -141,7 +141,7 @@ fn main() {
         }
         if save_samples {
             let img = image::load_from_memory(&sample.image.data).unwrap();
-            let filename = format!("sample_{:?}.jpg", num_samples_received);
+            let filename = format!("sample_{num_samples_received:?}.jpg");
             img.save(filename).unwrap();
         }
         num_samples_received += 1;
@@ -160,18 +160,15 @@ fn main() {
         }
     }
     client.stop();
-    info!(
-        "All samples processed. Got {:?} samples\n",
-        num_samples_received
-    );
+    info!("All samples processed. Got {num_samples_received:?} samples\n");
 
     // Report the per-bucket occupancy, good sanity check
     if crop_and_resize {
         let mut size_buckets_str = String::from("Size buckets:\n");
         for (size, count) in size_buckets.iter() {
-            size_buckets_str.push_str(&format!("{}: {}\n", size, count));
+            size_buckets_str.push_str(&format!("{size}: {count}\n"));
         }
-        info!("{}", size_buckets_str);
+        info!("{size_buckets_str}");
     }
 
    let elapsed_secs = start_time.elapsed().as_secs_f64();
{datago-2025.6.5 → datago-2025.8.1}/src/structs.rs
@@ -13,6 +13,7 @@ pub enum SourceType {
     Db,
     File,
     WebDataset,
+    Invalid,
 }
 
 fn default_source_type() -> SourceType {
{datago-2025.6.5 → datago-2025.8.1}/src/worker_files.rs
@@ -6,9 +6,8 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 async fn image_from_path(path: &str) -> Result<image::DynamicImage, image::ImageError> {
-    let bytes = std::fs::read(path).map_err(|e| {
-        image::ImageError::IoError(std::io::Error::new(std::io::ErrorKind::Other, e))
-    })?;
+    let bytes =
+        std::fs::read(path).map_err(|e| image::ImageError::IoError(std::io::Error::other(e)))?;
 
     image::load_from_memory(&bytes)
 }
@@ -70,7 +69,7 @@ async fn pull_sample(
             Ok(())
         }
         Err(e) => {
-            error!("Failed to load image from path {} {}", sample_json, e);
+            error!("Failed to load image from path {sample_json} {e}");
             Err(())
         }
     }
@@ -125,7 +124,7 @@ async fn async_pull_samples(
             debug!("file_worker: task failed or was cancelled");
         }
     });
-    debug!("file_worker: total samples sent: {}\n", count);
+    debug!("file_worker: total samples sent: {count}\n");
 
     // Signal the end of the stream
     if samples_tx.send(None).is_ok() {};
@@ -349,7 +348,7 @@ mod tests {
         // Create multiple test images
         let mut image_paths = Vec::new();
         for i in 0..3 {
-            let image_path = temp_dir.path().join(format!("test_{}.png", i));
+            let image_path = temp_dir.path().join(format!("test_{i}.png"));
             create_test_image(&image_path);
             image_paths.push(image_path.to_str().unwrap().to_string());
         }
@@ -391,7 +390,7 @@ mod tests {
 
         // Create more images than the limit
         for i in 0..10 {
-            let image_path = temp_dir.path().join(format!("test_{}.png", i));
+            let image_path = temp_dir.path().join(format!("test_{i}.png"));
             create_test_image(&image_path);
         }
 
@@ -400,7 +399,7 @@ mod tests {
 
         // Send more paths than the limit
         for i in 0..10 {
-            let path = temp_dir.path().join(format!("test_{}.png", i));
+            let path = temp_dir.path().join(format!("test_{i}.png"));
             metadata_tx
                 .send(serde_json::Value::String(
                    path.to_str().unwrap().to_string(),
{datago-2025.6.5 → datago-2025.8.1}/src/worker_http.rs
@@ -64,14 +64,13 @@ async fn image_from_url(
             match image::load_from_memory(&bytes) {
                 Ok(image) => return Ok(image),
                 Err(e) => {
-                    warn!("Failed to decode image from URL: {}. Retrying", url);
-                    warn!("Error: {:?}", e);
+                    warn!("Failed to decode image from URL: {url}. Retrying");
+                    warn!("Error: {e:?}");
                 }
             }
         }
     }
-    Err(image::ImageError::IoError(std::io::Error::new(
-        std::io::ErrorKind::Other,
+    Err(image::ImageError::IoError(std::io::Error::other(
         "Failed to fetch image bytes",
     )))
 }
@@ -88,14 +87,11 @@ async fn payload_from_url(
                 return Ok(bytes);
             }
             None => {
-                warn!("Failed to get bytes from URL: {}. Retrying", url);
+                warn!("Failed to get bytes from URL: {url}. Retrying");
             }
         }
     }
-    Err(std::io::Error::new(
-        std::io::ErrorKind::Other,
-        "Failed to fetch bytes buffer",
-    ))
+    Err(std::io::Error::other("Failed to fetch bytes buffer"))
 }
 
 async fn image_payload_from_url(
@@ -157,8 +153,8 @@ async fn pull_sample(
             Some(payload)
         }
         Err(e) => {
-            error!("Failed to get image from URL: {}\n {:?}", image_url, e);
-            error!("Error: {:?}", e);
+            error!("Failed to get image from URL: {image_url}\n {e:?}");
+            error!("Error: {e:?}");
             return Err(());
         }
     };
@@ -282,7 +278,7 @@ async fn async_pull_samples(
     // We use async-await here, to better use IO stalls
     // We'll keep a pool of N async tasks in parallel
     let max_tasks = min(num_cpus::get(), limit);
-    debug!("Using {} tasks in the async threadpool", max_tasks);
+    debug!("Using {max_tasks} tasks in the async threadpool");
     let mut tasks = tokio::task::JoinSet::new();
     let mut count = 0;
     let shareable_channel_tx: Arc<kanal::Sender<Option<Sample>>> = Arc::new(samples_tx);
@@ -314,7 +310,7 @@ async fn async_pull_samples(
             }
             Some(Err(e)) => {
                 // Task failed, log the error
-                error!("file_worker: task failed with error: {:?}", e);
+                error!("file_worker: task failed with error: {e}");
                 join_error = Some(e);
                 break;
             }
@@ -337,7 +333,7 @@ async fn async_pull_samples(
                 count += 1;
             }
             Err(e) => {
-                error!("dispatch_shards: task failed with error: {:?}", e);
+                error!("dispatch_shards: task failed with error: {e}");
                 if join_error.is_none() {
                     join_error = Some(e);
                 }
@@ -345,7 +341,7 @@ async fn async_pull_samples(
         }
     }
 
-    debug!("http_worker: total samples sent: {}\n", count);
+    debug!("http_worker: total samples sent: {count}\n");
 
     // Signal the end of the stream
     let _ = shareable_channel_tx.send(None); // Channel could have been closed by a .stop() call
@@ -387,7 +383,7 @@ pub fn pull_samples(
             debug!("http_worker: all samples pulled successfully");
         }
         Err(e) => {
-            error!("http_worker: error pulling samples: {:?}", e);
+            error!("http_worker: error pulling samples: {e}");
        }
    }
    });
{datago-2025.6.5 → datago-2025.8.1}/src/worker_wds.rs
@@ -102,7 +102,7 @@ async fn process_sample(
                 debug!("wds_worker: unpacked {}", item.filename);
             }
             Err(e) => {
-                debug!("wds_worker: error loading image: {}", e);
+                debug!("wds_worker: error loading image: {e}");
                 continue;
             }
         }
@@ -142,7 +142,7 @@ async fn async_deserialize_samples(
     // We use async-await here, to better use IO stalls
     // We'll keep a pool of N async tasks in parallel
     let max_tasks = min(num_cpus::get(), limit);
-    info!("Using {} tasks in the async threadpool", max_tasks);
+    info!("Using {max_tasks} tasks in the async threadpool");
     let mut tasks = tokio::task::JoinSet::new();
     let mut count = 0;
     let shareable_channel_tx: Arc<kanal::Sender<Option<Sample>>> = Arc::new(samples_tx);
@@ -172,7 +172,7 @@ async fn async_deserialize_samples(
         match result {
             Ok(_) => count += 1,
             Err(e) => {
-                join_error = Some(format!("Task failed: {}", e));
+                join_error = Some(format!("Task failed: {e}"));
                 break;
             }
         }
@@ -192,7 +192,7 @@ async fn async_deserialize_samples(
                 count += 1;
             }
             Err(e) => {
-                error!("dispatch_shards: task failed with error: {:?}", e);
+                error!("dispatch_shards: task failed with error: {e}");
                 if join_error.is_none() {
                     join_error = Some(e.to_string());
                 }
@@ -200,16 +200,13 @@ async fn async_deserialize_samples(
         }
     }
 
-    info!("wds_worker: total samples sent: {}\n", count);
+    info!("wds_worker: total samples sent: {count}\n");
 
     // Signal the end of the stream
     let _ = shareable_channel_tx.send(None); // Channel could have been closed by a .stop() call
 
     if let Some(error) = join_error {
-        error!(
-            "wds_worker: encountered an error while processing samples: {}",
-            error
-        );
+        error!("wds_worker: encountered an error while processing samples: {error}");
         return Err(error);
     }
     Ok(())
@@ -242,7 +239,7 @@ pub fn deserialize_samples(
         .await
     {
         Ok(_) => debug!("wds_worker: all samples processed successfully"),
-        Err(e) => error!("wds_worker: error processing samples : {:?}", e),
+        Err(e) => error!("wds_worker: error processing samples : {e}"),
    }
    });
 }