datachain 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/client/fsspec.py +1 -1
- datachain/lib/dc.py +60 -4
- datachain/lib/file.py +22 -8
- datachain/query/dataset.py +2 -2
- datachain/query/session.py +15 -3
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/METADATA +3 -3
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/RECORD +11 -11
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/LICENSE +0 -0
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/WHEEL +0 -0
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.9.0.dist-info → datachain-0.10.0.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
|
@@ -390,7 +390,7 @@ class Client(ABC):
|
|
|
390
390
|
) # type: ignore[return-value]
|
|
391
391
|
|
|
392
392
|
def upload(self, data: bytes, path: str) -> "File":
|
|
393
|
-
full_path = self.get_full_path(path)
|
|
393
|
+
full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
|
|
394
394
|
|
|
395
395
|
parent = posixpath.dirname(full_path)
|
|
396
396
|
self.fs.makedirs(parent, exist_ok=True)
|
datachain/lib/dc.py
CHANGED
|
@@ -411,6 +411,7 @@ class DataChain:
|
|
|
411
411
|
object_name: str = "file",
|
|
412
412
|
update: bool = False,
|
|
413
413
|
anon: bool = False,
|
|
414
|
+
client_config: Optional[dict] = None,
|
|
414
415
|
) -> "Self":
|
|
415
416
|
"""Get data from a storage as a list of file with all file attributes.
|
|
416
417
|
It returns the chain itself as usual.
|
|
@@ -423,15 +424,32 @@ class DataChain:
|
|
|
423
424
|
object_name : Created object column name.
|
|
424
425
|
update : force storage reindexing. Default is False.
|
|
425
426
|
anon : If True, we will treat cloud bucket as public one
|
|
427
|
+
client_config : Optional client configuration for the storage client.
|
|
426
428
|
|
|
427
429
|
Example:
|
|
430
|
+
Simple call from s3
|
|
428
431
|
```py
|
|
429
432
|
chain = DataChain.from_storage("s3://my-bucket/my-dir")
|
|
430
433
|
```
|
|
434
|
+
|
|
435
|
+
With AWS S3-compatible storage
|
|
436
|
+
```py
|
|
437
|
+
chain = DataChain.from_storage(
|
|
438
|
+
"s3://my-bucket/my-dir",
|
|
439
|
+
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
Pass existing session
|
|
444
|
+
```py
|
|
445
|
+
session = Session.get()
|
|
446
|
+
chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
|
|
447
|
+
```
|
|
431
448
|
"""
|
|
432
449
|
file_type = get_file_type(type)
|
|
433
450
|
|
|
434
|
-
|
|
451
|
+
if anon:
|
|
452
|
+
client_config = (client_config or {}) | {"anon": True}
|
|
435
453
|
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
436
454
|
cache = session.catalog.cache
|
|
437
455
|
client_config = session.catalog.client_config
|
|
@@ -481,25 +499,56 @@ class DataChain:
|
|
|
481
499
|
version: Optional[int] = None,
|
|
482
500
|
session: Optional[Session] = None,
|
|
483
501
|
settings: Optional[dict] = None,
|
|
484
|
-
|
|
502
|
+
fallback_to_studio: bool = True,
|
|
485
503
|
) -> "Self":
|
|
486
504
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
505
|
+
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
487
506
|
|
|
488
507
|
Parameters:
|
|
489
508
|
name : dataset name
|
|
490
509
|
version : dataset version
|
|
510
|
+
session : Session to use for the chain.
|
|
511
|
+
settings : Settings to use for the chain.
|
|
512
|
+
fallback_to_studio : Try to pull dataset from Studio if not found locally.
|
|
513
|
+
Default is True.
|
|
491
514
|
|
|
492
515
|
Example:
|
|
493
516
|
```py
|
|
494
517
|
chain = DataChain.from_dataset("my_cats")
|
|
495
518
|
```
|
|
519
|
+
|
|
520
|
+
```py
|
|
521
|
+
chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
```py
|
|
525
|
+
chain = DataChain.from_dataset("my_cats", version=1)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
```py
|
|
529
|
+
session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
|
|
530
|
+
settings = {
|
|
531
|
+
"cache": True,
|
|
532
|
+
"parallel": 4,
|
|
533
|
+
"workers": 4,
|
|
534
|
+
"min_task_size": 1000,
|
|
535
|
+
"prefetch": 10,
|
|
536
|
+
}
|
|
537
|
+
chain = DataChain.from_dataset(
|
|
538
|
+
name="my_cats",
|
|
539
|
+
version=1,
|
|
540
|
+
session=session,
|
|
541
|
+
settings=settings,
|
|
542
|
+
fallback_to_studio=True,
|
|
543
|
+
)
|
|
544
|
+
```
|
|
496
545
|
"""
|
|
497
546
|
query = DatasetQuery(
|
|
498
547
|
name=name,
|
|
499
548
|
version=version,
|
|
500
549
|
session=session,
|
|
501
550
|
indexing_column_types=File._datachain_column_types,
|
|
502
|
-
|
|
551
|
+
fallback_to_studio=fallback_to_studio,
|
|
503
552
|
)
|
|
504
553
|
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
505
554
|
if settings:
|
|
@@ -2444,7 +2493,7 @@ class DataChain:
|
|
|
2444
2493
|
self._setup = self._setup | kwargs
|
|
2445
2494
|
return self
|
|
2446
2495
|
|
|
2447
|
-
def
|
|
2496
|
+
def to_storage(
|
|
2448
2497
|
self,
|
|
2449
2498
|
output: str,
|
|
2450
2499
|
signal: str = "file",
|
|
@@ -2462,6 +2511,13 @@ class DataChain:
|
|
|
2462
2511
|
use_cache: If `True`, cache the files before exporting.
|
|
2463
2512
|
link_type: Method to use for exporting files.
|
|
2464
2513
|
Falls back to `'copy'` if symlinking fails.
|
|
2514
|
+
|
|
2515
|
+
Example:
|
|
2516
|
+
Cross cloud transfer
|
|
2517
|
+
```py
|
|
2518
|
+
ds = DataChain.from_storage("s3://mybucket")
|
|
2519
|
+
ds.to_storage("gs://mybucket", placement="filename")
|
|
2520
|
+
```
|
|
2465
2521
|
"""
|
|
2466
2522
|
if placement == "filename" and (
|
|
2467
2523
|
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
|
datachain/lib/file.py
CHANGED
|
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
|
|
|
17
17
|
from urllib.request import url2pathname
|
|
18
18
|
|
|
19
19
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
20
|
+
from fsspec.utils import stringify_path
|
|
20
21
|
from PIL import Image as PilImage
|
|
21
22
|
from pydantic import Field, field_validator
|
|
22
23
|
|
|
@@ -214,10 +215,13 @@ class File(DataModel):
|
|
|
214
215
|
|
|
215
216
|
catalog = get_catalog()
|
|
216
217
|
|
|
217
|
-
|
|
218
|
+
from datachain.client.fsspec import Client
|
|
218
219
|
|
|
219
|
-
|
|
220
|
-
|
|
220
|
+
client_cls = Client.get_implementation(path)
|
|
221
|
+
source, rel_path = client_cls.split_url(path)
|
|
222
|
+
|
|
223
|
+
client = catalog.get_client(client_cls.get_uri(source))
|
|
224
|
+
file = client.upload(data, rel_path)
|
|
221
225
|
if not isinstance(file, cls):
|
|
222
226
|
file = cls(**file.model_dump())
|
|
223
227
|
file._set_stream(catalog)
|
|
@@ -267,8 +271,9 @@ class File(DataModel):
|
|
|
267
271
|
|
|
268
272
|
def save(self, destination: str):
|
|
269
273
|
"""Writes it's content to destination"""
|
|
270
|
-
|
|
271
|
-
|
|
274
|
+
destination = stringify_path(destination)
|
|
275
|
+
client: Client = self._catalog.get_client(str(destination))
|
|
276
|
+
client.upload(self.read(), str(destination))
|
|
272
277
|
|
|
273
278
|
def _symlink_to(self, destination: str):
|
|
274
279
|
if self.location:
|
|
@@ -282,6 +287,7 @@ class File(DataModel):
|
|
|
282
287
|
source = self.get_path()
|
|
283
288
|
else:
|
|
284
289
|
raise OSError(errno.EXDEV, "can't link across filesystems")
|
|
290
|
+
|
|
285
291
|
return os.symlink(source, destination)
|
|
286
292
|
|
|
287
293
|
def export(
|
|
@@ -296,7 +302,8 @@ class File(DataModel):
|
|
|
296
302
|
self._caching_enabled = use_cache
|
|
297
303
|
dst = self.get_destination_path(output, placement)
|
|
298
304
|
dst_dir = os.path.dirname(dst)
|
|
299
|
-
|
|
305
|
+
client: Client = self._catalog.get_client(dst_dir)
|
|
306
|
+
client.fs.makedirs(dst_dir, exist_ok=True)
|
|
300
307
|
|
|
301
308
|
if link_type == "symlink":
|
|
302
309
|
try:
|
|
@@ -493,7 +500,10 @@ class TextFile(File):
|
|
|
493
500
|
|
|
494
501
|
def save(self, destination: str):
|
|
495
502
|
"""Writes it's content to destination"""
|
|
496
|
-
|
|
503
|
+
destination = stringify_path(destination)
|
|
504
|
+
|
|
505
|
+
client: Client = self._catalog.get_client(destination)
|
|
506
|
+
with client.fs.open(destination, mode="w") as f:
|
|
497
507
|
f.write(self.read_text())
|
|
498
508
|
|
|
499
509
|
|
|
@@ -507,7 +517,11 @@ class ImageFile(File):
|
|
|
507
517
|
|
|
508
518
|
def save(self, destination: str):
|
|
509
519
|
"""Writes it's content to destination"""
|
|
510
|
-
|
|
520
|
+
destination = stringify_path(destination)
|
|
521
|
+
|
|
522
|
+
client: Client = self._catalog.get_client(destination)
|
|
523
|
+
with client.fs.open(destination, mode="wb") as f:
|
|
524
|
+
self.read().save(f)
|
|
511
525
|
|
|
512
526
|
|
|
513
527
|
class Image(DataModel):
|
datachain/query/dataset.py
CHANGED
|
@@ -1085,7 +1085,7 @@ class DatasetQuery:
|
|
|
1085
1085
|
session: Optional[Session] = None,
|
|
1086
1086
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
1087
1087
|
in_memory: bool = False,
|
|
1088
|
-
|
|
1088
|
+
fallback_to_studio: bool = True,
|
|
1089
1089
|
) -> None:
|
|
1090
1090
|
self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
|
|
1091
1091
|
self.catalog = catalog or self.session.catalog
|
|
@@ -1103,7 +1103,7 @@ class DatasetQuery:
|
|
|
1103
1103
|
|
|
1104
1104
|
self.name = name
|
|
1105
1105
|
|
|
1106
|
-
if is_token_set():
|
|
1106
|
+
if fallback_to_studio and is_token_set():
|
|
1107
1107
|
ds = self.catalog.get_dataset_with_remote_fallback(name, version)
|
|
1108
1108
|
else:
|
|
1109
1109
|
ds = self.catalog.get_dataset(name)
|
datachain/query/session.py
CHANGED
|
@@ -139,21 +139,33 @@ class Session:
|
|
|
139
139
|
|
|
140
140
|
# Access the active (most recent) context from the stack
|
|
141
141
|
if cls.SESSION_CONTEXTS:
|
|
142
|
-
|
|
142
|
+
session = cls.SESSION_CONTEXTS[-1]
|
|
143
143
|
|
|
144
|
-
|
|
144
|
+
elif cls.GLOBAL_SESSION_CTX is None:
|
|
145
145
|
cls.GLOBAL_SESSION_CTX = Session(
|
|
146
146
|
cls.GLOBAL_SESSION_NAME,
|
|
147
147
|
catalog,
|
|
148
148
|
client_config=client_config,
|
|
149
149
|
in_memory=in_memory,
|
|
150
150
|
)
|
|
151
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
151
152
|
|
|
152
153
|
atexit.register(cls._global_cleanup)
|
|
153
154
|
cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
|
|
154
155
|
sys.excepthook = cls.except_hook
|
|
156
|
+
else:
|
|
157
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
155
158
|
|
|
156
|
-
|
|
159
|
+
if client_config and session.catalog.client_config != client_config:
|
|
160
|
+
session = Session(
|
|
161
|
+
"session" + uuid4().hex[:4],
|
|
162
|
+
catalog,
|
|
163
|
+
client_config=client_config,
|
|
164
|
+
in_memory=in_memory,
|
|
165
|
+
)
|
|
166
|
+
session.__enter__()
|
|
167
|
+
|
|
168
|
+
return session
|
|
157
169
|
|
|
158
170
|
@staticmethod
|
|
159
171
|
def except_hook(exc_type, exc_value, exc_traceback):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -175,7 +175,7 @@ high confidence scores.
|
|
|
175
175
|
|
|
176
176
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
177
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.
|
|
178
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +216,7 @@ Python code:
|
|
|
216
216
|
)
|
|
217
217
|
|
|
218
218
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.
|
|
219
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
220
|
|
|
221
221
|
print(f"{successful_chain.count()} files were exported")
|
|
222
222
|
|
|
@@ -36,7 +36,7 @@ datachain/cli/parser/utils.py,sha256=GEzxfPJ4i6nt6JhjvZ3PQesXl9islEV3E-N1NZGrLaA
|
|
|
36
36
|
datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
|
|
37
37
|
datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
|
|
38
38
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
39
|
-
datachain/client/fsspec.py,sha256=
|
|
39
|
+
datachain/client/fsspec.py,sha256=N_n3_DtZuKsLst8-XVda2xYCUHreUU3ld0MNTl8L9f4,14008
|
|
40
40
|
datachain/client/gcs.py,sha256=TY5K5INORKknTnoWDYv0EUztVLmuY1hHmdf2wUB_9uE,5114
|
|
41
41
|
datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
|
|
42
42
|
datachain/client/local.py,sha256=Pv67SYdkNkkNExBoKJF9AnNu0FSrt4JqLRkSVsUnveU,4672
|
|
@@ -68,8 +68,8 @@ datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
|
|
|
68
68
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
69
69
|
datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
|
|
70
70
|
datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
|
|
71
|
-
datachain/lib/dc.py,sha256=
|
|
72
|
-
datachain/lib/file.py,sha256=
|
|
71
|
+
datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
|
|
72
|
+
datachain/lib/file.py,sha256=8OblP_hYJLh0z7MWGo3AiyO48eEJ13tzgla1UQf9A8I,27517
|
|
73
73
|
datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
|
|
74
74
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
75
75
|
datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
|
|
@@ -103,13 +103,13 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
|
|
|
103
103
|
datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
|
|
104
104
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
105
105
|
datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
|
|
106
|
-
datachain/query/dataset.py,sha256=
|
|
106
|
+
datachain/query/dataset.py,sha256=wK_etZkH558pzLKAMBArlj1TQD9n96YK-kpVYBCSR38,57083
|
|
107
107
|
datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
|
|
108
108
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
109
109
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
110
110
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
111
111
|
datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
|
|
112
|
-
datachain/query/session.py,sha256=
|
|
112
|
+
datachain/query/session.py,sha256=I1KG8jDIaxGAfRfDRucMx8DqsANf_VYWtwtXjeD19lI,6399
|
|
113
113
|
datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
|
|
114
114
|
datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
|
|
115
115
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -135,9 +135,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
135
135
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
136
136
|
datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
|
|
137
137
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
138
|
-
datachain-0.
|
|
139
|
-
datachain-0.
|
|
140
|
-
datachain-0.
|
|
141
|
-
datachain-0.
|
|
142
|
-
datachain-0.
|
|
143
|
-
datachain-0.
|
|
138
|
+
datachain-0.10.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
139
|
+
datachain-0.10.0.dist-info/METADATA,sha256=4Eoe6lnoy_HBYtdzrAIjNnagKXagattQ_mluP9WC-ek,11195
|
|
140
|
+
datachain-0.10.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
141
|
+
datachain-0.10.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
142
|
+
datachain-0.10.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
143
|
+
datachain-0.10.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|