datago 2025.3.11__tar.gz → 2025.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datago-2025.3.11 → datago-2025.4.1}/Cargo.lock +1 -1
- {datago-2025.3.11 → datago-2025.4.1}/Cargo.toml +2 -2
- {datago-2025.3.11 → datago-2025.4.1}/PKG-INFO +18 -1
- {datago-2025.3.11 → datago-2025.4.1}/README.md +17 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/generator_http.rs +71 -1
- {datago-2025.3.11 → datago-2025.4.1}/src/image_processing.rs +18 -9
- {datago-2025.3.11 → datago-2025.4.1}/tests/client_test.rs +114 -0
- {datago-2025.3.11 → datago-2025.4.1}/.github/workflows/ci-cd.yml +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/.github/workflows/rust.yml +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/.gitignore +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/.pre-commit-config.yaml +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/LICENSE +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/pyproject.toml +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/benchmark_db.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/benchmark_filesystem.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/dataset.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/raw_types.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/test_datago_db.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/python/test_datago_filesystem.py +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/requirements-tests.txt +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/requirements.txt +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/client.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/generator_files.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/lib.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/main.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/structs.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/worker_files.rs +0 -0
- {datago-2025.3.11 → datago-2025.4.1}/src/worker_http.rs +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "datago"
|
|
3
3
|
edition = "2021"
|
|
4
|
-
version = "2025.3.11"
|
|
4
|
+
version = "2025.4.1"
|
|
5
5
|
|
|
6
6
|
[lib]
|
|
7
7
|
# exposed by pyo3
|
|
@@ -14,7 +14,7 @@ name = "datago"
|
|
|
14
14
|
path = "src/main.rs"
|
|
15
15
|
|
|
16
16
|
[dependencies]
|
|
17
|
-
image = "0.25.5"
|
|
17
|
+
image = { version = "0.25.5", features = ["png"]}
|
|
18
18
|
reqwest = { version = "0.12.12", features = ["blocking"] }
|
|
19
19
|
serde = { version = "1.0", features = ["derive"] }
|
|
20
20
|
serde_json = "1.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datago
|
|
3
|
-
Version: 2025.3.11
|
|
3
|
+
Version: 2025.4.1
|
|
4
4
|
Classifier: Programming Language :: Rust
|
|
5
5
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
6
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
@@ -102,6 +102,23 @@ See helper functions provided in `raw_types.py`, should be self explanatory. Che
|
|
|
102
102
|
|
|
103
103
|
Just install the rust toolchain via rustup
|
|
104
104
|
|
|
105
|
+
## [Apple Silicon MacOS only]
|
|
106
|
+
|
|
107
|
+
If you are using an Apple Silicon Mac OS machine, create a `.cargo/config` file and paste the following:
|
|
108
|
+
```
|
|
109
|
+
[target.x86_64-apple-darwin]
|
|
110
|
+
rustflags = [
|
|
111
|
+
"-C", "link-arg=-undefined",
|
|
112
|
+
"-C", "link-arg=dynamic_lookup",
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
[target.aarch64-apple-darwin]
|
|
116
|
+
rustflags = [
|
|
117
|
+
"-C", "link-arg=-undefined",
|
|
118
|
+
"-C", "link-arg=dynamic_lookup",
|
|
119
|
+
]
|
|
120
|
+
```
|
|
121
|
+
|
|
105
122
|
## Build a benchmark CLI
|
|
106
123
|
`cargo run --release -- -h` to get all the information, should be fairly straightforward
|
|
107
124
|
|
|
@@ -86,6 +86,23 @@ See helper functions provided in `raw_types.py`, should be self explanatory. Che
|
|
|
86
86
|
|
|
87
87
|
Just install the rust toolchain via rustup
|
|
88
88
|
|
|
89
|
+
## [Apple Silicon MacOS only]
|
|
90
|
+
|
|
91
|
+
If you are using an Apple Silicon Mac OS machine, create a `.cargo/config` file and paste the following:
|
|
92
|
+
```
|
|
93
|
+
[target.x86_64-apple-darwin]
|
|
94
|
+
rustflags = [
|
|
95
|
+
"-C", "link-arg=-undefined",
|
|
96
|
+
"-C", "link-arg=dynamic_lookup",
|
|
97
|
+
]
|
|
98
|
+
|
|
99
|
+
[target.aarch64-apple-darwin]
|
|
100
|
+
rustflags = [
|
|
101
|
+
"-C", "link-arg=-undefined",
|
|
102
|
+
"-C", "link-arg=dynamic_lookup",
|
|
103
|
+
]
|
|
104
|
+
```
|
|
105
|
+
|
|
89
106
|
## Build a benchmark CLI
|
|
90
107
|
`cargo run --release -- -h` to get all the information, should be fairly straightforward
|
|
91
108
|
|
|
@@ -24,9 +24,18 @@ pub struct SourceDBConfig {
|
|
|
24
24
|
#[serde(default)]
|
|
25
25
|
pub tags: String,
|
|
26
26
|
|
|
27
|
+
#[serde(default)]
|
|
28
|
+
pub tags_all: String,
|
|
29
|
+
|
|
27
30
|
#[serde(default)]
|
|
28
31
|
pub tags_ne: String,
|
|
29
32
|
|
|
33
|
+
#[serde(default)]
|
|
34
|
+
pub tags_ne_all: String,
|
|
35
|
+
|
|
36
|
+
#[serde(default)]
|
|
37
|
+
pub tags_empty: String,
|
|
38
|
+
|
|
30
39
|
#[serde(default)]
|
|
31
40
|
pub has_attributes: String,
|
|
32
41
|
|
|
@@ -60,6 +69,9 @@ pub struct SourceDBConfig {
|
|
|
60
69
|
#[serde(default)]
|
|
61
70
|
pub duplicate_state: i32,
|
|
62
71
|
|
|
72
|
+
#[serde(default)]
|
|
73
|
+
pub attributes: String,
|
|
74
|
+
|
|
63
75
|
#[serde(default)]
|
|
64
76
|
pub random_sampling: bool,
|
|
65
77
|
}
|
|
@@ -73,7 +85,10 @@ struct DbRequest {
|
|
|
73
85
|
pub page_size: String,
|
|
74
86
|
|
|
75
87
|
pub tags: String,
|
|
88
|
+
pub tags_all: String,
|
|
76
89
|
pub tags_ne: String,
|
|
90
|
+
pub tags_ne_all: String,
|
|
91
|
+
pub tags_empty: String,
|
|
77
92
|
|
|
78
93
|
pub has_attributes: String,
|
|
79
94
|
pub lacks_attributes: String,
|
|
@@ -92,6 +107,7 @@ struct DbRequest {
|
|
|
92
107
|
pub max_pixel_count: String,
|
|
93
108
|
|
|
94
109
|
pub duplicate_state: String,
|
|
110
|
+
pub attributes: String,
|
|
95
111
|
pub random_sampling: bool,
|
|
96
112
|
|
|
97
113
|
pub partitions_count: String,
|
|
@@ -131,7 +147,10 @@ impl DbRequest {
|
|
|
131
147
|
maybe_add_field("page_size", &self.page_size);
|
|
132
148
|
|
|
133
149
|
maybe_add_field("tags", &self.tags);
|
|
150
|
+
maybe_add_field("tags__all", &self.tags_all);
|
|
134
151
|
maybe_add_field("tags__ne", &self.tags_ne);
|
|
152
|
+
maybe_add_field("tags__ne_all", &self.tags_ne_all);
|
|
153
|
+
maybe_add_field("tags__empty", &self.tags_empty);
|
|
135
154
|
maybe_add_field("has_attributes", &self.has_attributes);
|
|
136
155
|
maybe_add_field("lacks_attributes", &self.lacks_attributes);
|
|
137
156
|
maybe_add_field("has_masks", &self.has_masks);
|
|
@@ -144,6 +163,7 @@ impl DbRequest {
|
|
|
144
163
|
maybe_add_field("pixel_count__gte", &self.min_pixel_count);
|
|
145
164
|
maybe_add_field("pixel_count__lte", &self.max_pixel_count);
|
|
146
165
|
maybe_add_field("duplicate_state", &self.duplicate_state);
|
|
166
|
+
maybe_add_field("attributes", &self.attributes);
|
|
147
167
|
maybe_add_field("partitions_count", &self.partitions_count);
|
|
148
168
|
maybe_add_field("partition", &self.partition);
|
|
149
169
|
}
|
|
@@ -180,7 +200,53 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
|
|
|
180
200
|
|
|
181
201
|
if !source_config.tags.is_empty() {
|
|
182
202
|
fields.push_str(",tags");
|
|
183
|
-
println!(
|
|
203
|
+
println!(
|
|
204
|
+
"Including some tags, must have any of: {}",
|
|
205
|
+
source_config.tags
|
|
206
|
+
);
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
if !source_config.tags_all.is_empty() {
|
|
210
|
+
fields.push_str(",tags");
|
|
211
|
+
println!(
|
|
212
|
+
"Including tags, must have all of: {}",
|
|
213
|
+
source_config.tags_all
|
|
214
|
+
);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if !source_config.tags_ne.is_empty() {
|
|
218
|
+
fields.push_str(",tags");
|
|
219
|
+
println!(
|
|
220
|
+
"Including tags, must not have any of: {}",
|
|
221
|
+
source_config.tags_ne
|
|
222
|
+
);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
if !source_config.tags_empty.is_empty() {
|
|
226
|
+
fields.push_str(",tags");
|
|
227
|
+
println!(
|
|
228
|
+
"Using filter: Tags must{} be empty",
|
|
229
|
+
if source_config.tags_empty == "true" {
|
|
230
|
+
" not"
|
|
231
|
+
} else {
|
|
232
|
+
""
|
|
233
|
+
}
|
|
234
|
+
);
|
|
235
|
+
if !source_config.tags_all.is_empty()
|
|
236
|
+
|| !source_config.tags.is_empty()
|
|
237
|
+
|| !source_config.tags_ne.is_empty()
|
|
238
|
+
|| !source_config.tags_ne_all.is_empty()
|
|
239
|
+
{
|
|
240
|
+
println!("WARNING: you've set `tags_empty` in addition to `tags`, `tags_all`, `tags_ne` or `tags_ne_all`. The combination might be incompatible or redundant.");
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
if !source_config.tags_ne_all.is_empty() {
|
|
245
|
+
fields.push_str(",tags");
|
|
246
|
+
println!(
|
|
247
|
+
"Including tags, must not have all of: {}",
|
|
248
|
+
source_config.tags_ne_all
|
|
249
|
+
);
|
|
184
250
|
}
|
|
185
251
|
|
|
186
252
|
if source_config.require_embeddings {
|
|
@@ -213,7 +279,10 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
|
|
|
213
279
|
sources_ne: source_config.sources_ne,
|
|
214
280
|
page_size: source_config.page_size.to_string(),
|
|
215
281
|
tags: source_config.tags,
|
|
282
|
+
tags_all: source_config.tags_all,
|
|
216
283
|
tags_ne: source_config.tags_ne,
|
|
284
|
+
tags_ne_all: source_config.tags_ne_all,
|
|
285
|
+
tags_empty: source_config.tags_empty,
|
|
217
286
|
has_attributes: source_config.has_attributes,
|
|
218
287
|
lacks_attributes: source_config.lacks_attributes,
|
|
219
288
|
has_masks: source_config.has_masks,
|
|
@@ -226,6 +295,7 @@ fn build_request(source_config: SourceDBConfig, rank: usize, world_size: usize)
|
|
|
226
295
|
min_pixel_count: maybe_add_int(source_config.min_pixel_count),
|
|
227
296
|
max_pixel_count: maybe_add_int(source_config.max_pixel_count),
|
|
228
297
|
duplicate_state: maybe_add_int(source_config.duplicate_state),
|
|
298
|
+
attributes: source_config.attributes,
|
|
229
299
|
random_sampling: source_config.random_sampling,
|
|
230
300
|
partition: if world_size > 1 {
|
|
231
301
|
format!("{}", rank)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
use crate::structs::ImagePayload;
|
|
2
|
+
use image::ImageEncoder;
|
|
2
3
|
use serde::Deserialize;
|
|
3
4
|
use serde::Serialize;
|
|
4
5
|
use std::io::Cursor;
|
|
5
|
-
|
|
6
6
|
// --- Sample data structures - these will be exposed to the Python world ---------------------------------------------------------------------------------------------------------------------------------------------------------------
|
|
7
7
|
|
|
8
8
|
#[derive(Debug, Serialize, Deserialize)]
|
|
@@ -176,15 +176,24 @@ pub async fn image_to_payload(
|
|
|
176
176
|
// Encode the image if needed
|
|
177
177
|
let mut image_bytes: Vec<u8> = Vec::new();
|
|
178
178
|
if encode_images {
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
179
|
+
// Use the encoder directly with the raw bytes
|
|
180
|
+
image::codecs::png::PngEncoder::new_with_quality(
|
|
181
|
+
&mut Cursor::new(&mut image_bytes),
|
|
182
|
+
image::codecs::png::CompressionType::Fast,
|
|
183
|
+
image::codecs::png::FilterType::Adaptive,
|
|
184
|
+
)
|
|
185
|
+
.write_image(
|
|
186
|
+
image.as_bytes(),
|
|
187
|
+
image.width(),
|
|
188
|
+
image.height(),
|
|
189
|
+
image.color().into(),
|
|
190
|
+
)
|
|
191
|
+
.map_err(|e| {
|
|
192
|
+
image::ImageError::IoError(std::io::Error::new(
|
|
184
193
|
std::io::ErrorKind::Other,
|
|
185
|
-
|
|
186
|
-
))
|
|
187
|
-
}
|
|
194
|
+
e.to_string(),
|
|
195
|
+
))
|
|
196
|
+
})?;
|
|
188
197
|
|
|
189
198
|
channels = -1; // Signal the fact that the image is encoded
|
|
190
199
|
} else {
|
|
@@ -19,6 +19,9 @@ fn get_test_config() -> serde_json::Value {
|
|
|
19
19
|
"require_embeddings": false,
|
|
20
20
|
"tags": "",
|
|
21
21
|
"tags_ne": "",
|
|
22
|
+
"tags_all": "",
|
|
23
|
+
"tags_ne_all": "",
|
|
24
|
+
"tags_empty": "",
|
|
22
25
|
"has_attributes": "",
|
|
23
26
|
"lacks_attributes": "",
|
|
24
27
|
"has_masks": "",
|
|
@@ -30,6 +33,7 @@ fn get_test_config() -> serde_json::Value {
|
|
|
30
33
|
"min_pixel_count": -1,
|
|
31
34
|
"max_pixel_count": -1,
|
|
32
35
|
"duplicate_state": -1,
|
|
36
|
+
"attributes": "",
|
|
33
37
|
"random_sampling": false,
|
|
34
38
|
"page_size": 10,
|
|
35
39
|
},
|
|
@@ -253,6 +257,116 @@ fn test_tags() {
|
|
|
253
257
|
client.stop();
|
|
254
258
|
}
|
|
255
259
|
|
|
260
|
+
#[test]
|
|
261
|
+
fn test_tags_all() {
|
|
262
|
+
let mut config = get_test_config();
|
|
263
|
+
let tags = "v4_trainset_hq,photo";
|
|
264
|
+
config["source_config"]["tags_all"] = tags.into();
|
|
265
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
266
|
+
|
|
267
|
+
let sample = client.get_sample();
|
|
268
|
+
assert!(sample.is_some());
|
|
269
|
+
|
|
270
|
+
let sample = sample.unwrap();
|
|
271
|
+
assert!(!sample.id.is_empty());
|
|
272
|
+
// Check that sample.tags contains all the tags in the tags string
|
|
273
|
+
for tag in tags.split(',') {
|
|
274
|
+
assert!(sample.tags.contains(&tag.to_string()));
|
|
275
|
+
}
|
|
276
|
+
client.stop();
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
#[test]
|
|
280
|
+
fn test_tags_ne() {
|
|
281
|
+
let mut config = get_test_config();
|
|
282
|
+
let tags = "v4_trainset_hq,photo";
|
|
283
|
+
config["source_config"]["tags_ne"] = tags.into();
|
|
284
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
285
|
+
|
|
286
|
+
let sample = client.get_sample();
|
|
287
|
+
assert!(sample.is_some());
|
|
288
|
+
|
|
289
|
+
let sample = sample.unwrap();
|
|
290
|
+
assert!(!sample.id.is_empty());
|
|
291
|
+
// Check that sample.tags does not contain any of the tags in the tags string
|
|
292
|
+
println!("{:?}", sample.tags);
|
|
293
|
+
for tag in tags.split(',') {
|
|
294
|
+
assert!(!sample.tags.contains(&tag.to_string()));
|
|
295
|
+
}
|
|
296
|
+
client.stop();
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
#[test]
|
|
300
|
+
fn test_tags_empty() {
|
|
301
|
+
let mut config = get_test_config();
|
|
302
|
+
config["source_config"]["tags_empty"] = "true".into();
|
|
303
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
304
|
+
|
|
305
|
+
let sample = client.get_sample();
|
|
306
|
+
assert!(sample.is_some());
|
|
307
|
+
|
|
308
|
+
let sample = sample.unwrap();
|
|
309
|
+
assert!(sample.tags.is_empty());
|
|
310
|
+
client.stop();
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
#[test]
|
|
314
|
+
fn test_tags_ne_all() {
|
|
315
|
+
let mut config = get_test_config();
|
|
316
|
+
let tag1 = "photo";
|
|
317
|
+
let tag2 = "graphic";
|
|
318
|
+
config["source_config"]["tags_ne_all"] = format!("{},{}", tag1, tag2).into();
|
|
319
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
320
|
+
|
|
321
|
+
let sample = client.get_sample();
|
|
322
|
+
assert!(sample.is_some());
|
|
323
|
+
|
|
324
|
+
let sample = sample.unwrap();
|
|
325
|
+
assert!(!sample.id.is_empty());
|
|
326
|
+
// Assert that the sample does not contain both tags at the same time
|
|
327
|
+
let has_first = sample.tags.contains(&tag1.to_string());
|
|
328
|
+
let has_second = sample.tags.contains(&tag2.to_string());
|
|
329
|
+
assert!(
|
|
330
|
+
!(has_first && has_second),
|
|
331
|
+
"Sample should not contain both tags at the same time"
|
|
332
|
+
);
|
|
333
|
+
client.stop();
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
#[test]
|
|
337
|
+
fn test_attributes_filter() {
|
|
338
|
+
let mut config = get_test_config();
|
|
339
|
+
config["source_config"]["attributes"] = "aesthetic_score__gte:0.5".into();
|
|
340
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
341
|
+
|
|
342
|
+
let sample = client.get_sample();
|
|
343
|
+
assert!(sample.is_some());
|
|
344
|
+
|
|
345
|
+
let sample = sample.unwrap();
|
|
346
|
+
assert!(!sample.id.is_empty());
|
|
347
|
+
assert!(sample.attributes.contains_key("aesthetic_score"));
|
|
348
|
+
assert!(sample.attributes["aesthetic_score"].as_f64().unwrap() >= 0.5);
|
|
349
|
+
client.stop();
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
#[test]
|
|
353
|
+
fn test_pixel_count_filter() {
|
|
354
|
+
let mut config = get_test_config();
|
|
355
|
+
config["source_config"]["min_pixel_count"] = 1000000.into();
|
|
356
|
+
config["source_config"]["max_pixel_count"] = 2000000.into();
|
|
357
|
+
config["source_config"]["require_images"] = json!(true);
|
|
358
|
+
let mut client = DatagoClient::new(config.to_string());
|
|
359
|
+
|
|
360
|
+
let sample = client.get_sample();
|
|
361
|
+
assert!(sample.is_some());
|
|
362
|
+
|
|
363
|
+
let sample = sample.unwrap();
|
|
364
|
+
assert!(!sample.id.is_empty());
|
|
365
|
+
assert!(sample.image.width * sample.image.height >= 1000000);
|
|
366
|
+
assert!(sample.image.width * sample.image.height <= 2000000);
|
|
367
|
+
client.stop();
|
|
368
|
+
}
|
|
369
|
+
|
|
256
370
|
#[test]
|
|
257
371
|
fn test_multiple_sources() {
|
|
258
372
|
let limit = 10;
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|