datago 2025.3.11__tar.gz → 2025.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {datago-2025.3.11 → datago-2025.4.1}/Cargo.lock +1 -1
  2. {datago-2025.3.11 → datago-2025.4.1}/Cargo.toml +2 -2
  3. {datago-2025.3.11 → datago-2025.4.1}/PKG-INFO +18 -1
  4. {datago-2025.3.11 → datago-2025.4.1}/README.md +17 -0
  5. {datago-2025.3.11 → datago-2025.4.1}/src/generator_http.rs +71 -1
  6. {datago-2025.3.11 → datago-2025.4.1}/src/image_processing.rs +18 -9
  7. {datago-2025.3.11 → datago-2025.4.1}/tests/client_test.rs +114 -0
  8. {datago-2025.3.11 → datago-2025.4.1}/.github/workflows/ci-cd.yml +0 -0
  9. {datago-2025.3.11 → datago-2025.4.1}/.github/workflows/rust.yml +0 -0
  10. {datago-2025.3.11 → datago-2025.4.1}/.gitignore +0 -0
  11. {datago-2025.3.11 → datago-2025.4.1}/.pre-commit-config.yaml +0 -0
  12. {datago-2025.3.11 → datago-2025.4.1}/LICENSE +0 -0
  13. {datago-2025.3.11 → datago-2025.4.1}/pyproject.toml +0 -0
  14. {datago-2025.3.11 → datago-2025.4.1}/python/benchmark_db.py +0 -0
  15. {datago-2025.3.11 → datago-2025.4.1}/python/benchmark_filesystem.py +0 -0
  16. {datago-2025.3.11 → datago-2025.4.1}/python/dataset.py +0 -0
  17. {datago-2025.3.11 → datago-2025.4.1}/python/raw_types.py +0 -0
  18. {datago-2025.3.11 → datago-2025.4.1}/python/test_datago_db.py +0 -0
  19. {datago-2025.3.11 → datago-2025.4.1}/python/test_datago_filesystem.py +0 -0
  20. {datago-2025.3.11 → datago-2025.4.1}/requirements-tests.txt +0 -0
  21. {datago-2025.3.11 → datago-2025.4.1}/requirements.txt +0 -0
  22. {datago-2025.3.11 → datago-2025.4.1}/src/client.rs +0 -0
  23. {datago-2025.3.11 → datago-2025.4.1}/src/generator_files.rs +0 -0
  24. {datago-2025.3.11 → datago-2025.4.1}/src/lib.rs +0 -0
  25. {datago-2025.3.11 → datago-2025.4.1}/src/main.rs +0 -0
  26. {datago-2025.3.11 → datago-2025.4.1}/src/structs.rs +0 -0
  27. {datago-2025.3.11 → datago-2025.4.1}/src/worker_files.rs +0 -0
  28. {datago-2025.3.11 → datago-2025.4.1}/src/worker_http.rs +0 -0
@@ -387,7 +387,7 @@ dependencies = [
387
387
 
388
388
  [[package]]
389
389
  name = "datago"
390
- version = "2025.3.11"
390
+ version = "2025.4.1"
391
391
  dependencies = [
392
392
  "clap",
393
393
  "image",
@@ -1,7 +1,7 @@
1
1
  [package]
2
2
  name = "datago"
3
3
  edition = "2021"
4
- version = "2025.3.11"
4
+ version = "2025.4.1"
5
5
 
6
6
  [lib]
7
7
  # exposed by pyo3
@@ -14,7 +14,7 @@ name = "datago"
14
14
  path = "src/main.rs"
15
15
 
16
16
  [dependencies]
17
- image = "0.25.5"
17
+ image = { version = "0.25.5", features = ["png"]}
18
18
  reqwest = { version = "0.12.12", features = ["blocking"] }
19
19
  serde = { version = "1.0", features = ["derive"] }
20
20
  serde_json = "1.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datago
3
- Version: 2025.3.11
3
+ Version: 2025.4.1
4
4
  Classifier: Programming Language :: Rust
5
5
  Classifier: Programming Language :: Python :: Implementation :: CPython
6
6
  Classifier: Programming Language :: Python :: Implementation :: PyPy
@@ -102,6 +102,23 @@ See helper functions provided in `raw_types.py`, should be self explanatory. Che
102
102
 
103
103
  Just install the rust toolchain via rustup
104
104
 
105
+ ## [Apple Silicon MacOS only]
106
+
107
+ If you are using an Apple Silicon Mac OS machine, create a `.cargo/config` file and paste the following:
108
+ ```
109
+ [target.x86_64-apple-darwin]
110
+ rustflags = [
111
+ "-C", "link-arg=-undefined",
112
+ "-C", "link-arg=dynamic_lookup",
113
+ ]
114
+
115
+ [target.aarch64-apple-darwin]
116
+ rustflags = [
117
+ "-C", "link-arg=-undefined",
118
+ "-C", "link-arg=dynamic_lookup",
119
+ ]
120
+ ```
121
+
105
122
  ## Build a benchmark CLI
106
123
  `cargo run --release -- -h` to get all the information, should be fairly straightforward
107
124
 
@@ -86,6 +86,23 @@ See helper functions provided in `raw_types.py`, should be self explanatory. Che
86
86
 
87
87
  Just install the rust toolchain via rustup
88
88
 
89
+ ## [Apple Silicon MacOS only]
90
+
91
+ If you are using an Apple Silicon Mac OS machine, create a `.cargo/config` file and paste the following:
92
+ ```
93
+ [target.x86_64-apple-darwin]
94
+ rustflags = [
95
+ "-C", "link-arg=-undefined",
96
+ "-C", "link-arg=dynamic_lookup",
97
+ ]
98
+
99
+ [target.aarch64-apple-darwin]
100
+ rustflags = [
101
+ "-C", "link-arg=-undefined",
102
+ "-C", "link-arg=dynamic_lookup",
103
+ ]
104
+ ```
105
+
89
106
  ## Build a benchmark CLI
90
107
  `cargo run --release -- -h` to get all the information, should be fairly straightforward
91
108
 
@@ -24,9 +24,18 @@ pub struct SourceDBConfig {
24
24
  #[serde(default)]
25
25
  pub tags: String,
26
26
 
27
+ #[serde(default)]
28
+ pub tags_all: String,
29
+
27
30
  #[serde(default)]
28
31
  pub tags_ne: String,
29
32
 
33
+ #[serde(default)]
34
+ pub tags_ne_all: String,
35
+
36
+ #[serde(default)]
37
+ pub tags_empty: String,
38
+
30
39
  #[serde(default)]
31
40
  pub has_attributes: String,
32
41
 
@@ -60,6 +69,9 @@ pub struct SourceDBConfig {
60
69
  #[serde(default)]
61
70
  pub duplicate_state: i32,
62
71
 
72
+ #[serde(default)]
73
+ pub attributes: String,
74
+
63
75
  #[serde(default)]
64
76
  pub random_sampling: bool,
65
77
  }
@@ -73,7 +85,10 @@ struct DbRequest {
73
85
  pub page_size: String,
74
86
 
75
87
  pub tags: String,
88
+ pub tags_all: String,
76
89
  pub tags_ne: String,
90
+ pub tags_ne_all: String,
91
+ pub tags_empty: String,
77
92
 
78
93
  pub has_attributes: String,
79
94
  pub lacks_attributes: String,
@@ -92,6 +107,7 @@ struct DbRequest {
92
107
  pub max_pixel_count: String,
93
108
 
94
109
  pub duplicate_state: String,
110
+ pub attributes: String,
95
111
  pub random_sampling: bool,
96
112
 
97
113
  pub partitions_count: String,
@@ -131,7 +147,10 @@ impl DbRequest {
131
147
  maybe_add_field("page_size", &self.page_size);
132
148
 
133
149
  maybe_add_field("tags", &self.tags);
150
+ maybe_add_field("tags__all", &self.tags_all);
134
151
  maybe_add_field("tags__ne", &self.tags_ne);
152
+ maybe_add_field("tags__ne_all", &self.tags_ne_all);
153
+ maybe_add_field("tags__empty", &self.tags_empty);
135
154
  maybe_add_field("has_attributes", &self.has_attributes);
136
155
  maybe_add_field("lacks_attributes", &self.lacks_attributes);
137
156
  maybe_add_field("has_masks", &self.has_masks);
@@ -144,6 +163,7 @@ impl DbRequest {
144
163
  maybe_add_field("pixel_count__gte", &self.min_pixel_count);
145
164
  maybe_add_field("pixel_count__lte", &self.max_pixel_count);
146
165
  maybe_add_field("duplicate_state", &self.duplicate_state);
166
+ maybe_add_field("attributes", &self.attributes);
147
167
  maybe_add_field("partitions_count", &self.partitions_count);
148
168
  maybe_add_field("partition", &self.partition);
149
169
  }
@@ -180,7 +200,53 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
180
200
 
181
201
  if !source_config.tags.is_empty() {
182
202
  fields.push_str(",tags");
183
- println!("Including some tags: {}", source_config.tags);
203
+ println!(
204
+ "Including some tags, must have any of: {}",
205
+ source_config.tags
206
+ );
207
+ }
208
+
209
+ if !source_config.tags_all.is_empty() {
210
+ fields.push_str(",tags");
211
+ println!(
212
+ "Including tags, must have all of: {}",
213
+ source_config.tags_all
214
+ );
215
+ }
216
+
217
+ if !source_config.tags_ne.is_empty() {
218
+ fields.push_str(",tags");
219
+ println!(
220
+ "Including tags, must not have any of: {}",
221
+ source_config.tags_ne
222
+ );
223
+ }
224
+
225
+ if !source_config.tags_empty.is_empty() {
226
+ fields.push_str(",tags");
227
+ println!(
228
+ "Using filter: Tags must{} be empty",
229
+ if source_config.tags_empty == "true" {
230
+ " not"
231
+ } else {
232
+ ""
233
+ }
234
+ );
235
+ if !source_config.tags_all.is_empty()
236
+ || !source_config.tags.is_empty()
237
+ || !source_config.tags_ne.is_empty()
238
+ || !source_config.tags_ne_all.is_empty()
239
+ {
240
+ println!("WARNING: you've set `tags_empty` in addition to `tags`, `tags_all`, `tags_ne` or `tags_ne_all`. The combination might be incompatible or redundant.");
241
+ }
242
+ }
243
+
244
+ if !source_config.tags_ne_all.is_empty() {
245
+ fields.push_str(",tags");
246
+ println!(
247
+ "Including tags, must not have all of: {}",
248
+ source_config.tags_ne_all
249
+ );
184
250
  }
185
251
 
186
252
  if source_config.require_embeddings {
@@ -213,7 +279,10 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
213
279
  sources_ne: source_config.sources_ne,
214
280
  page_size: source_config.page_size.to_string(),
215
281
  tags: source_config.tags,
282
+ tags_all: source_config.tags_all,
216
283
  tags_ne: source_config.tags_ne,
284
+ tags_ne_all: source_config.tags_ne_all,
285
+ tags_empty: source_config.tags_empty,
217
286
  has_attributes: source_config.has_attributes,
218
287
  lacks_attributes: source_config.lacks_attributes,
219
288
  has_masks: source_config.has_masks,
@@ -226,6 +295,7 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
226
295
  min_pixel_count: maybe_add_int(source_config.min_pixel_count),
227
296
  max_pixel_count: maybe_add_int(source_config.max_pixel_count),
228
297
  duplicate_state: maybe_add_int(source_config.duplicate_state),
298
+ attributes: source_config.attributes,
229
299
  random_sampling: source_config.random_sampling,
230
300
  partition: if world_size > 1 {
231
301
  format!("{}", rank)
@@ -1,8 +1,8 @@
1
1
  use crate::structs::ImagePayload;
2
+ use image::ImageEncoder;
2
3
  use serde::Deserialize;
3
4
  use serde::Serialize;
4
5
  use std::io::Cursor;
5
-
6
6
  // --- Sample data structures - these will be exposed to the Python world ---------------------------------------------------------------------------------------------------------------------------------------------------------------
7
7
 
8
8
  #[derive(Debug, Serialize, Deserialize)]
@@ -176,15 +176,24 @@ pub async fn image_to_payload(
176
176
  // Encode the image if needed
177
177
  let mut image_bytes: Vec<u8> = Vec::new();
178
178
  if encode_images {
179
- if image
180
- .write_to(&mut Cursor::new(&mut image_bytes), image::ImageFormat::Png)
181
- .is_err()
182
- {
183
- return Err(image::ImageError::IoError(std::io::Error::new(
179
+ // Use the encoder directly with the raw bytes
180
+ image::codecs::png::PngEncoder::new_with_quality(
181
+ &mut Cursor::new(&mut image_bytes),
182
+ image::codecs::png::CompressionType::Fast,
183
+ image::codecs::png::FilterType::Adaptive,
184
+ )
185
+ .write_image(
186
+ image.as_bytes(),
187
+ image.width(),
188
+ image.height(),
189
+ image.color().into(),
190
+ )
191
+ .map_err(|e| {
192
+ image::ImageError::IoError(std::io::Error::new(
184
193
  std::io::ErrorKind::Other,
185
- "Failed to encode image",
186
- )));
187
- }
194
+ e.to_string(),
195
+ ))
196
+ })?;
188
197
 
189
198
  channels = -1; // Signal the fact that the image is encoded
190
199
  } else {
@@ -19,6 +19,9 @@ fn get_test_config() -> serde_json::Value {
19
19
  "require_embeddings": false,
20
20
  "tags": "",
21
21
  "tags_ne": "",
22
+ "tags_all": "",
23
+ "tags_ne_all": "",
24
+ "tags_empty": "",
22
25
  "has_attributes": "",
23
26
  "lacks_attributes": "",
24
27
  "has_masks": "",
@@ -30,6 +33,7 @@ fn get_test_config() -> serde_json::Value {
30
33
  "min_pixel_count": -1,
31
34
  "max_pixel_count": -1,
32
35
  "duplicate_state": -1,
36
+ "attributes": "",
33
37
  "random_sampling": false,
34
38
  "page_size": 10,
35
39
  },
@@ -253,6 +257,116 @@ fn test_tags() {
253
257
  client.stop();
254
258
  }
255
259
 
260
+ #[test]
261
+ fn test_tags_all() {
262
+ let mut config = get_test_config();
263
+ let tags = "v4_trainset_hq,photo";
264
+ config["source_config"]["tags_all"] = tags.into();
265
+ let mut client = DatagoClient::new(config.to_string());
266
+
267
+ let sample = client.get_sample();
268
+ assert!(sample.is_some());
269
+
270
+ let sample = sample.unwrap();
271
+ assert!(!sample.id.is_empty());
272
+ // Check that sample.tags contains all the tags in the tags string
273
+ for tag in tags.split(',') {
274
+ assert!(sample.tags.contains(&tag.to_string()));
275
+ }
276
+ client.stop();
277
+ }
278
+
279
+ #[test]
280
+ fn test_tags_ne() {
281
+ let mut config = get_test_config();
282
+ let tags = "v4_trainset_hq,photo";
283
+ config["source_config"]["tags_ne"] = tags.into();
284
+ let mut client = DatagoClient::new(config.to_string());
285
+
286
+ let sample = client.get_sample();
287
+ assert!(sample.is_some());
288
+
289
+ let sample = sample.unwrap();
290
+ assert!(!sample.id.is_empty());
291
+ // Check that sample.tags does not contain any of the tags in the tags string
292
+ println!("{:?}", sample.tags);
293
+ for tag in tags.split(',') {
294
+ assert!(!sample.tags.contains(&tag.to_string()));
295
+ }
296
+ client.stop();
297
+ }
298
+
299
+ #[test]
300
+ fn test_tags_empty() {
301
+ let mut config = get_test_config();
302
+ config["source_config"]["tags_empty"] = "true".into();
303
+ let mut client = DatagoClient::new(config.to_string());
304
+
305
+ let sample = client.get_sample();
306
+ assert!(sample.is_some());
307
+
308
+ let sample = sample.unwrap();
309
+ assert!(sample.tags.is_empty());
310
+ client.stop();
311
+ }
312
+
313
+ #[test]
314
+ fn test_tags_ne_all() {
315
+ let mut config = get_test_config();
316
+ let tag1 = "photo";
317
+ let tag2 = "graphic";
318
+ config["source_config"]["tags_ne_all"] = format!("{},{}", tag1, tag2).into();
319
+ let mut client = DatagoClient::new(config.to_string());
320
+
321
+ let sample = client.get_sample();
322
+ assert!(sample.is_some());
323
+
324
+ let sample = sample.unwrap();
325
+ assert!(!sample.id.is_empty());
326
+ // Assert that the sample does not contain both tags at the same time
327
+ let has_first = sample.tags.contains(&tag1.to_string());
328
+ let has_second = sample.tags.contains(&tag2.to_string());
329
+ assert!(
330
+ !(has_first && has_second),
331
+ "Sample should not contain both tags at the same time"
332
+ );
333
+ client.stop();
334
+ }
335
+
336
+ #[test]
337
+ fn test_attributes_filter() {
338
+ let mut config = get_test_config();
339
+ config["source_config"]["attributes"] = "aesthetic_score__gte:0.5".into();
340
+ let mut client = DatagoClient::new(config.to_string());
341
+
342
+ let sample = client.get_sample();
343
+ assert!(sample.is_some());
344
+
345
+ let sample = sample.unwrap();
346
+ assert!(!sample.id.is_empty());
347
+ assert!(sample.attributes.contains_key("aesthetic_score"));
348
+ assert!(sample.attributes["aesthetic_score"].as_f64().unwrap() >= 0.5);
349
+ client.stop();
350
+ }
351
+
352
+ #[test]
353
+ fn test_pixel_count_filter() {
354
+ let mut config = get_test_config();
355
+ config["source_config"]["min_pixel_count"] = 1000000.into();
356
+ config["source_config"]["max_pixel_count"] = 2000000.into();
357
+ config["source_config"]["require_images"] = json!(true);
358
+ let mut client = DatagoClient::new(config.to_string());
359
+
360
+ let sample = client.get_sample();
361
+ assert!(sample.is_some());
362
+
363
+ let sample = sample.unwrap();
364
+ assert!(!sample.id.is_empty());
365
+ assert!(sample.image.width * sample.image.height >= 1000000);
366
+ assert!(sample.image.width * sample.image.height <= 2000000);
367
+ client.stop();
368
+ }
369
+
256
370
  #[test]
257
371
  fn test_multiple_sources() {
258
372
  let limit = 10;
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes