datachain 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/client/fsspec.py +1 -1
- datachain/lib/dc.py +60 -4
- datachain/lib/file.py +16 -5
- datachain/query/dataset.py +2 -2
- datachain/query/session.py +15 -3
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/METADATA +3 -3
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/RECORD +11 -11
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/LICENSE +0 -0
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/WHEEL +0 -0
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.9.1.dist-info → datachain-0.10.0.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
|
@@ -390,7 +390,7 @@ class Client(ABC):
|
|
|
390
390
|
) # type: ignore[return-value]
|
|
391
391
|
|
|
392
392
|
def upload(self, data: bytes, path: str) -> "File":
|
|
393
|
-
full_path = self.get_full_path(path)
|
|
393
|
+
full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
|
|
394
394
|
|
|
395
395
|
parent = posixpath.dirname(full_path)
|
|
396
396
|
self.fs.makedirs(parent, exist_ok=True)
|
datachain/lib/dc.py
CHANGED
|
@@ -411,6 +411,7 @@ class DataChain:
|
|
|
411
411
|
object_name: str = "file",
|
|
412
412
|
update: bool = False,
|
|
413
413
|
anon: bool = False,
|
|
414
|
+
client_config: Optional[dict] = None,
|
|
414
415
|
) -> "Self":
|
|
415
416
|
"""Get data from a storage as a list of file with all file attributes.
|
|
416
417
|
It returns the chain itself as usual.
|
|
@@ -423,15 +424,32 @@ class DataChain:
|
|
|
423
424
|
object_name : Created object column name.
|
|
424
425
|
update : force storage reindexing. Default is False.
|
|
425
426
|
anon : If True, we will treat cloud bucket as public one
|
|
427
|
+
client_config : Optional client configuration for the storage client.
|
|
426
428
|
|
|
427
429
|
Example:
|
|
430
|
+
Simple call from s3
|
|
428
431
|
```py
|
|
429
432
|
chain = DataChain.from_storage("s3://my-bucket/my-dir")
|
|
430
433
|
```
|
|
434
|
+
|
|
435
|
+
With AWS S3-compatible storage
|
|
436
|
+
```py
|
|
437
|
+
chain = DataChain.from_storage(
|
|
438
|
+
"s3://my-bucket/my-dir",
|
|
439
|
+
client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
Pass existing session
|
|
444
|
+
```py
|
|
445
|
+
session = Session.get()
|
|
446
|
+
chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
|
|
447
|
+
```
|
|
431
448
|
"""
|
|
432
449
|
file_type = get_file_type(type)
|
|
433
450
|
|
|
434
|
-
|
|
451
|
+
if anon:
|
|
452
|
+
client_config = (client_config or {}) | {"anon": True}
|
|
435
453
|
session = Session.get(session, client_config=client_config, in_memory=in_memory)
|
|
436
454
|
cache = session.catalog.cache
|
|
437
455
|
client_config = session.catalog.client_config
|
|
@@ -481,25 +499,56 @@ class DataChain:
|
|
|
481
499
|
version: Optional[int] = None,
|
|
482
500
|
session: Optional[Session] = None,
|
|
483
501
|
settings: Optional[dict] = None,
|
|
484
|
-
|
|
502
|
+
fallback_to_studio: bool = True,
|
|
485
503
|
) -> "Self":
|
|
486
504
|
"""Get data from a saved Dataset. It returns the chain itself.
|
|
505
|
+
If dataset or version is not found locally, it will try to pull it from Studio.
|
|
487
506
|
|
|
488
507
|
Parameters:
|
|
489
508
|
name : dataset name
|
|
490
509
|
version : dataset version
|
|
510
|
+
session : Session to use for the chain.
|
|
511
|
+
settings : Settings to use for the chain.
|
|
512
|
+
fallback_to_studio : Try to pull dataset from Studio if not found locally.
|
|
513
|
+
Default is True.
|
|
491
514
|
|
|
492
515
|
Example:
|
|
493
516
|
```py
|
|
494
517
|
chain = DataChain.from_dataset("my_cats")
|
|
495
518
|
```
|
|
519
|
+
|
|
520
|
+
```py
|
|
521
|
+
chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
|
|
522
|
+
```
|
|
523
|
+
|
|
524
|
+
```py
|
|
525
|
+
chain = DataChain.from_dataset("my_cats", version=1)
|
|
526
|
+
```
|
|
527
|
+
|
|
528
|
+
```py
|
|
529
|
+
session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
|
|
530
|
+
settings = {
|
|
531
|
+
"cache": True,
|
|
532
|
+
"parallel": 4,
|
|
533
|
+
"workers": 4,
|
|
534
|
+
"min_task_size": 1000,
|
|
535
|
+
"prefetch": 10,
|
|
536
|
+
}
|
|
537
|
+
chain = DataChain.from_dataset(
|
|
538
|
+
name="my_cats",
|
|
539
|
+
version=1,
|
|
540
|
+
session=session,
|
|
541
|
+
settings=settings,
|
|
542
|
+
fallback_to_studio=True,
|
|
543
|
+
)
|
|
544
|
+
```
|
|
496
545
|
"""
|
|
497
546
|
query = DatasetQuery(
|
|
498
547
|
name=name,
|
|
499
548
|
version=version,
|
|
500
549
|
session=session,
|
|
501
550
|
indexing_column_types=File._datachain_column_types,
|
|
502
|
-
|
|
551
|
+
fallback_to_studio=fallback_to_studio,
|
|
503
552
|
)
|
|
504
553
|
telemetry.send_event_once("class", "datachain_init", name=name, version=version)
|
|
505
554
|
if settings:
|
|
@@ -2444,7 +2493,7 @@ class DataChain:
|
|
|
2444
2493
|
self._setup = self._setup | kwargs
|
|
2445
2494
|
return self
|
|
2446
2495
|
|
|
2447
|
-
def
|
|
2496
|
+
def to_storage(
|
|
2448
2497
|
self,
|
|
2449
2498
|
output: str,
|
|
2450
2499
|
signal: str = "file",
|
|
@@ -2462,6 +2511,13 @@ class DataChain:
|
|
|
2462
2511
|
use_cache: If `True`, cache the files before exporting.
|
|
2463
2512
|
link_type: Method to use for exporting files.
|
|
2464
2513
|
Falls back to `'copy'` if symlinking fails.
|
|
2514
|
+
|
|
2515
|
+
Example:
|
|
2516
|
+
Cross cloud transfer
|
|
2517
|
+
```py
|
|
2518
|
+
ds = DataChain.from_storage("s3://mybucket")
|
|
2519
|
+
ds.to_storage("gs://mybucket", placement="filename")
|
|
2520
|
+
```
|
|
2465
2521
|
"""
|
|
2466
2522
|
if placement == "filename" and (
|
|
2467
2523
|
self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
|
datachain/lib/file.py
CHANGED
|
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
|
|
|
17
17
|
from urllib.request import url2pathname
|
|
18
18
|
|
|
19
19
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
20
|
+
from fsspec.utils import stringify_path
|
|
20
21
|
from PIL import Image as PilImage
|
|
21
22
|
from pydantic import Field, field_validator
|
|
22
23
|
|
|
@@ -270,8 +271,9 @@ class File(DataModel):
|
|
|
270
271
|
|
|
271
272
|
def save(self, destination: str):
|
|
272
273
|
"""Writes it's content to destination"""
|
|
273
|
-
|
|
274
|
-
|
|
274
|
+
destination = stringify_path(destination)
|
|
275
|
+
client: Client = self._catalog.get_client(str(destination))
|
|
276
|
+
client.upload(self.read(), str(destination))
|
|
275
277
|
|
|
276
278
|
def _symlink_to(self, destination: str):
|
|
277
279
|
if self.location:
|
|
@@ -285,6 +287,7 @@ class File(DataModel):
|
|
|
285
287
|
source = self.get_path()
|
|
286
288
|
else:
|
|
287
289
|
raise OSError(errno.EXDEV, "can't link across filesystems")
|
|
290
|
+
|
|
288
291
|
return os.symlink(source, destination)
|
|
289
292
|
|
|
290
293
|
def export(
|
|
@@ -299,7 +302,8 @@ class File(DataModel):
|
|
|
299
302
|
self._caching_enabled = use_cache
|
|
300
303
|
dst = self.get_destination_path(output, placement)
|
|
301
304
|
dst_dir = os.path.dirname(dst)
|
|
302
|
-
|
|
305
|
+
client: Client = self._catalog.get_client(dst_dir)
|
|
306
|
+
client.fs.makedirs(dst_dir, exist_ok=True)
|
|
303
307
|
|
|
304
308
|
if link_type == "symlink":
|
|
305
309
|
try:
|
|
@@ -496,7 +500,10 @@ class TextFile(File):
|
|
|
496
500
|
|
|
497
501
|
def save(self, destination: str):
|
|
498
502
|
"""Writes it's content to destination"""
|
|
499
|
-
|
|
503
|
+
destination = stringify_path(destination)
|
|
504
|
+
|
|
505
|
+
client: Client = self._catalog.get_client(destination)
|
|
506
|
+
with client.fs.open(destination, mode="w") as f:
|
|
500
507
|
f.write(self.read_text())
|
|
501
508
|
|
|
502
509
|
|
|
@@ -510,7 +517,11 @@ class ImageFile(File):
|
|
|
510
517
|
|
|
511
518
|
def save(self, destination: str):
|
|
512
519
|
"""Writes it's content to destination"""
|
|
513
|
-
|
|
520
|
+
destination = stringify_path(destination)
|
|
521
|
+
|
|
522
|
+
client: Client = self._catalog.get_client(destination)
|
|
523
|
+
with client.fs.open(destination, mode="wb") as f:
|
|
524
|
+
self.read().save(f)
|
|
514
525
|
|
|
515
526
|
|
|
516
527
|
class Image(DataModel):
|
datachain/query/dataset.py
CHANGED
|
@@ -1085,7 +1085,7 @@ class DatasetQuery:
|
|
|
1085
1085
|
session: Optional[Session] = None,
|
|
1086
1086
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
1087
1087
|
in_memory: bool = False,
|
|
1088
|
-
|
|
1088
|
+
fallback_to_studio: bool = True,
|
|
1089
1089
|
) -> None:
|
|
1090
1090
|
self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
|
|
1091
1091
|
self.catalog = catalog or self.session.catalog
|
|
@@ -1103,7 +1103,7 @@ class DatasetQuery:
|
|
|
1103
1103
|
|
|
1104
1104
|
self.name = name
|
|
1105
1105
|
|
|
1106
|
-
if
|
|
1106
|
+
if fallback_to_studio and is_token_set():
|
|
1107
1107
|
ds = self.catalog.get_dataset_with_remote_fallback(name, version)
|
|
1108
1108
|
else:
|
|
1109
1109
|
ds = self.catalog.get_dataset(name)
|
datachain/query/session.py
CHANGED
|
@@ -139,21 +139,33 @@ class Session:
|
|
|
139
139
|
|
|
140
140
|
# Access the active (most recent) context from the stack
|
|
141
141
|
if cls.SESSION_CONTEXTS:
|
|
142
|
-
|
|
142
|
+
session = cls.SESSION_CONTEXTS[-1]
|
|
143
143
|
|
|
144
|
-
|
|
144
|
+
elif cls.GLOBAL_SESSION_CTX is None:
|
|
145
145
|
cls.GLOBAL_SESSION_CTX = Session(
|
|
146
146
|
cls.GLOBAL_SESSION_NAME,
|
|
147
147
|
catalog,
|
|
148
148
|
client_config=client_config,
|
|
149
149
|
in_memory=in_memory,
|
|
150
150
|
)
|
|
151
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
151
152
|
|
|
152
153
|
atexit.register(cls._global_cleanup)
|
|
153
154
|
cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
|
|
154
155
|
sys.excepthook = cls.except_hook
|
|
156
|
+
else:
|
|
157
|
+
session = cls.GLOBAL_SESSION_CTX
|
|
155
158
|
|
|
156
|
-
|
|
159
|
+
if client_config and session.catalog.client_config != client_config:
|
|
160
|
+
session = Session(
|
|
161
|
+
"session" + uuid4().hex[:4],
|
|
162
|
+
catalog,
|
|
163
|
+
client_config=client_config,
|
|
164
|
+
in_memory=in_memory,
|
|
165
|
+
)
|
|
166
|
+
session.__enter__()
|
|
167
|
+
|
|
168
|
+
return session
|
|
157
169
|
|
|
158
170
|
@staticmethod
|
|
159
171
|
def except_hook(exc_type, exc_value, exc_traceback):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.9.1
|
|
3
|
+
Version: 0.10.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -175,7 +175,7 @@ high confidence scores.
|
|
|
175
175
|
|
|
176
176
|
likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
|
|
177
177
|
& (Column("meta.inference.class_") == "cat"))
|
|
178
|
-
likely_cats.
|
|
178
|
+
likely_cats.to_storage("high-confidence-cats/", signal="file")
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
Example: LLM based text-file evaluation
|
|
@@ -216,7 +216,7 @@ Python code:
|
|
|
216
216
|
)
|
|
217
217
|
|
|
218
218
|
successful_chain = chain.filter(Column("is_success") == True)
|
|
219
|
-
successful_chain.
|
|
219
|
+
successful_chain.to_storage("./output_mistral")
|
|
220
220
|
|
|
221
221
|
print(f"{successful_chain.count()} files were exported")
|
|
222
222
|
|
|
@@ -36,7 +36,7 @@ datachain/cli/parser/utils.py,sha256=GEzxfPJ4i6nt6JhjvZ3PQesXl9islEV3E-N1NZGrLaA
|
|
|
36
36
|
datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
|
|
37
37
|
datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
|
|
38
38
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
39
|
-
datachain/client/fsspec.py,sha256=
|
|
39
|
+
datachain/client/fsspec.py,sha256=N_n3_DtZuKsLst8-XVda2xYCUHreUU3ld0MNTl8L9f4,14008
|
|
40
40
|
datachain/client/gcs.py,sha256=TY5K5INORKknTnoWDYv0EUztVLmuY1hHmdf2wUB_9uE,5114
|
|
41
41
|
datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
|
|
42
42
|
datachain/client/local.py,sha256=Pv67SYdkNkkNExBoKJF9AnNu0FSrt4JqLRkSVsUnveU,4672
|
|
@@ -68,8 +68,8 @@ datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
|
|
|
68
68
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
69
69
|
datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
|
|
70
70
|
datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
|
|
71
|
-
datachain/lib/dc.py,sha256=
|
|
72
|
-
datachain/lib/file.py,sha256=
|
|
71
|
+
datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
|
|
72
|
+
datachain/lib/file.py,sha256=8OblP_hYJLh0z7MWGo3AiyO48eEJ13tzgla1UQf9A8I,27517
|
|
73
73
|
datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
|
|
74
74
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
75
75
|
datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
|
|
@@ -103,13 +103,13 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
|
|
|
103
103
|
datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
|
|
104
104
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
105
105
|
datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
|
|
106
|
-
datachain/query/dataset.py,sha256=
|
|
106
|
+
datachain/query/dataset.py,sha256=wK_etZkH558pzLKAMBArlj1TQD9n96YK-kpVYBCSR38,57083
|
|
107
107
|
datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
|
|
108
108
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
109
109
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
110
110
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
111
111
|
datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
|
|
112
|
-
datachain/query/session.py,sha256=
|
|
112
|
+
datachain/query/session.py,sha256=I1KG8jDIaxGAfRfDRucMx8DqsANf_VYWtwtXjeD19lI,6399
|
|
113
113
|
datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
|
|
114
114
|
datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
|
|
115
115
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -135,9 +135,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
135
135
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
136
136
|
datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
|
|
137
137
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
138
|
-
datachain-0.9.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
139
|
-
datachain-0.
|
|
140
|
-
datachain-0.9.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
141
|
-
datachain-0.9.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
142
|
-
datachain-0.9.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
143
|
-
datachain-0.9.1.dist-info/RECORD,,
|
|
138
|
+
datachain-0.10.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
139
|
+
datachain-0.10.0.dist-info/METADATA,sha256=4Eoe6lnoy_HBYtdzrAIjNnagKXagattQ_mluP9WC-ek,11195
|
|
140
|
+
datachain-0.10.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
141
|
+
datachain-0.10.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
142
|
+
datachain-0.10.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
143
|
+
datachain-0.10.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|