datachain-0.9.0-py3-none-any.whl → datachain-0.10.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/client/fsspec.py CHANGED
@@ -390,7 +390,7 @@ class Client(ABC):
390
390
  ) # type: ignore[return-value]
391
391
 
392
392
  def upload(self, data: bytes, path: str) -> "File":
393
- full_path = self.get_full_path(path)
393
+ full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
394
394
 
395
395
  parent = posixpath.dirname(full_path)
396
396
  self.fs.makedirs(parent, exist_ok=True)
datachain/lib/dc.py CHANGED
@@ -411,6 +411,7 @@ class DataChain:
411
411
  object_name: str = "file",
412
412
  update: bool = False,
413
413
  anon: bool = False,
414
+ client_config: Optional[dict] = None,
414
415
  ) -> "Self":
415
416
  """Get data from a storage as a list of file with all file attributes.
416
417
  It returns the chain itself as usual.
@@ -423,15 +424,32 @@ class DataChain:
423
424
  object_name : Created object column name.
424
425
  update : force storage reindexing. Default is False.
425
426
  anon : If True, we will treat cloud bucket as public one
427
+ client_config : Optional client configuration for the storage client.
426
428
 
427
429
  Example:
430
+ Simple call from s3
428
431
  ```py
429
432
  chain = DataChain.from_storage("s3://my-bucket/my-dir")
430
433
  ```
434
+
435
+ With AWS S3-compatible storage
436
+ ```py
437
+ chain = DataChain.from_storage(
438
+ "s3://my-bucket/my-dir",
439
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
440
+ )
441
+ ```
442
+
443
+ Pass existing session
444
+ ```py
445
+ session = Session.get()
446
+ chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
447
+ ```
431
448
  """
432
449
  file_type = get_file_type(type)
433
450
 
434
- client_config = {"anon": True} if anon else None
451
+ if anon:
452
+ client_config = (client_config or {}) | {"anon": True}
435
453
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
436
454
  cache = session.catalog.cache
437
455
  client_config = session.catalog.client_config
@@ -481,25 +499,56 @@ class DataChain:
481
499
  version: Optional[int] = None,
482
500
  session: Optional[Session] = None,
483
501
  settings: Optional[dict] = None,
484
- fallback_to_remote: bool = True,
502
+ fallback_to_studio: bool = True,
485
503
  ) -> "Self":
486
504
  """Get data from a saved Dataset. It returns the chain itself.
505
+ If dataset or version is not found locally, it will try to pull it from Studio.
487
506
 
488
507
  Parameters:
489
508
  name : dataset name
490
509
  version : dataset version
510
+ session : Session to use for the chain.
511
+ settings : Settings to use for the chain.
512
+ fallback_to_studio : Try to pull dataset from Studio if not found locally.
513
+ Default is True.
491
514
 
492
515
  Example:
493
516
  ```py
494
517
  chain = DataChain.from_dataset("my_cats")
495
518
  ```
519
+
520
+ ```py
521
+ chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
522
+ ```
523
+
524
+ ```py
525
+ chain = DataChain.from_dataset("my_cats", version=1)
526
+ ```
527
+
528
+ ```py
529
+ session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
530
+ settings = {
531
+ "cache": True,
532
+ "parallel": 4,
533
+ "workers": 4,
534
+ "min_task_size": 1000,
535
+ "prefetch": 10,
536
+ }
537
+ chain = DataChain.from_dataset(
538
+ name="my_cats",
539
+ version=1,
540
+ session=session,
541
+ settings=settings,
542
+ fallback_to_studio=True,
543
+ )
544
+ ```
496
545
  """
497
546
  query = DatasetQuery(
498
547
  name=name,
499
548
  version=version,
500
549
  session=session,
501
550
  indexing_column_types=File._datachain_column_types,
502
- fallback_to_remote=fallback_to_remote,
551
+ fallback_to_studio=fallback_to_studio,
503
552
  )
504
553
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
505
554
  if settings:
@@ -2444,7 +2493,7 @@ class DataChain:
2444
2493
  self._setup = self._setup | kwargs
2445
2494
  return self
2446
2495
 
2447
- def export_files(
2496
+ def to_storage(
2448
2497
  self,
2449
2498
  output: str,
2450
2499
  signal: str = "file",
@@ -2462,6 +2511,13 @@ class DataChain:
2462
2511
  use_cache: If `True`, cache the files before exporting.
2463
2512
  link_type: Method to use for exporting files.
2464
2513
  Falls back to `'copy'` if symlinking fails.
2514
+
2515
+ Example:
2516
+ Cross cloud transfer
2517
+ ```py
2518
+ ds = DataChain.from_storage("s3://mybucket")
2519
+ ds.to_storage("gs://mybucket", placement="filename")
2520
+ ```
2465
2521
  """
2466
2522
  if placement == "filename" and (
2467
2523
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
datachain/lib/file.py CHANGED
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
17
17
  from urllib.request import url2pathname
18
18
 
19
19
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
20
+ from fsspec.utils import stringify_path
20
21
  from PIL import Image as PilImage
21
22
  from pydantic import Field, field_validator
22
23
 
@@ -214,10 +215,13 @@ class File(DataModel):
214
215
 
215
216
  catalog = get_catalog()
216
217
 
217
- parent, name = posixpath.split(path)
218
+ from datachain.client.fsspec import Client
218
219
 
219
- client = catalog.get_client(parent)
220
- file = client.upload(data, name)
220
+ client_cls = Client.get_implementation(path)
221
+ source, rel_path = client_cls.split_url(path)
222
+
223
+ client = catalog.get_client(client_cls.get_uri(source))
224
+ file = client.upload(data, rel_path)
221
225
  if not isinstance(file, cls):
222
226
  file = cls(**file.model_dump())
223
227
  file._set_stream(catalog)
@@ -267,8 +271,9 @@ class File(DataModel):
267
271
 
268
272
  def save(self, destination: str):
269
273
  """Writes it's content to destination"""
270
- with open(destination, mode="wb") as f:
271
- f.write(self.read())
274
+ destination = stringify_path(destination)
275
+ client: Client = self._catalog.get_client(str(destination))
276
+ client.upload(self.read(), str(destination))
272
277
 
273
278
  def _symlink_to(self, destination: str):
274
279
  if self.location:
@@ -282,6 +287,7 @@ class File(DataModel):
282
287
  source = self.get_path()
283
288
  else:
284
289
  raise OSError(errno.EXDEV, "can't link across filesystems")
290
+
285
291
  return os.symlink(source, destination)
286
292
 
287
293
  def export(
@@ -296,7 +302,8 @@ class File(DataModel):
296
302
  self._caching_enabled = use_cache
297
303
  dst = self.get_destination_path(output, placement)
298
304
  dst_dir = os.path.dirname(dst)
299
- os.makedirs(dst_dir, exist_ok=True)
305
+ client: Client = self._catalog.get_client(dst_dir)
306
+ client.fs.makedirs(dst_dir, exist_ok=True)
300
307
 
301
308
  if link_type == "symlink":
302
309
  try:
@@ -493,7 +500,10 @@ class TextFile(File):
493
500
 
494
501
  def save(self, destination: str):
495
502
  """Writes it's content to destination"""
496
- with open(destination, mode="w") as f:
503
+ destination = stringify_path(destination)
504
+
505
+ client: Client = self._catalog.get_client(destination)
506
+ with client.fs.open(destination, mode="w") as f:
497
507
  f.write(self.read_text())
498
508
 
499
509
 
@@ -507,7 +517,11 @@ class ImageFile(File):
507
517
 
508
518
  def save(self, destination: str):
509
519
  """Writes it's content to destination"""
510
- self.read().save(destination)
520
+ destination = stringify_path(destination)
521
+
522
+ client: Client = self._catalog.get_client(destination)
523
+ with client.fs.open(destination, mode="wb") as f:
524
+ self.read().save(f)
511
525
 
512
526
 
513
527
  class Image(DataModel):
datachain/query/dataset.py CHANGED
@@ -1085,7 +1085,7 @@ class DatasetQuery:
1085
1085
  session: Optional[Session] = None,
1086
1086
  indexing_column_types: Optional[dict[str, Any]] = None,
1087
1087
  in_memory: bool = False,
1088
- fallback_to_remote: bool = True,
1088
+ fallback_to_studio: bool = True,
1089
1089
  ) -> None:
1090
1090
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1091
1091
  self.catalog = catalog or self.session.catalog
@@ -1103,7 +1103,7 @@ class DatasetQuery:
1103
1103
 
1104
1104
  self.name = name
1105
1105
 
1106
- if fallback_to_remote and is_token_set():
1106
+ if fallback_to_studio and is_token_set():
1107
1107
  ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1108
1108
  else:
1109
1109
  ds = self.catalog.get_dataset(name)
datachain/query/session.py CHANGED
@@ -139,21 +139,33 @@ class Session:
139
139
 
140
140
  # Access the active (most recent) context from the stack
141
141
  if cls.SESSION_CONTEXTS:
142
- return cls.SESSION_CONTEXTS[-1]
142
+ session = cls.SESSION_CONTEXTS[-1]
143
143
 
144
- if cls.GLOBAL_SESSION_CTX is None:
144
+ elif cls.GLOBAL_SESSION_CTX is None:
145
145
  cls.GLOBAL_SESSION_CTX = Session(
146
146
  cls.GLOBAL_SESSION_NAME,
147
147
  catalog,
148
148
  client_config=client_config,
149
149
  in_memory=in_memory,
150
150
  )
151
+ session = cls.GLOBAL_SESSION_CTX
151
152
 
152
153
  atexit.register(cls._global_cleanup)
153
154
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
154
155
  sys.excepthook = cls.except_hook
156
+ else:
157
+ session = cls.GLOBAL_SESSION_CTX
155
158
 
156
- return cls.GLOBAL_SESSION_CTX
159
+ if client_config and session.catalog.client_config != client_config:
160
+ session = Session(
161
+ "session" + uuid4().hex[:4],
162
+ catalog,
163
+ client_config=client_config,
164
+ in_memory=in_memory,
165
+ )
166
+ session.__enter__()
167
+
168
+ return session
157
169
 
158
170
  @staticmethod
159
171
  def except_hook(exc_type, exc_value, exc_traceback):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.0
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -36,7 +36,7 @@ datachain/cli/parser/utils.py,sha256=GEzxfPJ4i6nt6JhjvZ3PQesXl9islEV3E-N1NZGrLaA
36
36
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
37
37
  datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
38
38
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
39
- datachain/client/fsspec.py,sha256=whQWKD0tGQUjc8FpA0irxc31wiJuQS4x2arHc98-Lv0,13966
39
+ datachain/client/fsspec.py,sha256=N_n3_DtZuKsLst8-XVda2xYCUHreUU3ld0MNTl8L9f4,14008
40
40
  datachain/client/gcs.py,sha256=TY5K5INORKknTnoWDYv0EUztVLmuY1hHmdf2wUB_9uE,5114
41
41
  datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
42
42
  datachain/client/local.py,sha256=Pv67SYdkNkkNExBoKJF9AnNu0FSrt4JqLRkSVsUnveU,4672
@@ -68,8 +68,8 @@ datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
68
68
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
69
69
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
70
70
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
71
- datachain/lib/dc.py,sha256=rQZgFLFSIde6wfiAnBnlSE4qnjNjNXQ0F3TGhDQ6ap8,93459
72
- datachain/lib/file.py,sha256=HXH4pgPN_Zx9rPI0jy-Cjl2F3ItJchH7ycrIXnOgaGk,26892
71
+ datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
72
+ datachain/lib/file.py,sha256=8OblP_hYJLh0z7MWGo3AiyO48eEJ13tzgla1UQf9A8I,27517
73
73
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
74
74
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
75
75
  datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
@@ -103,13 +103,13 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
103
103
  datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
104
104
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
105
105
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
106
- datachain/query/dataset.py,sha256=tXinFa-0ytC4j3W3XKjSkVW3qX3iFGQyRO800k9JW98,57083
106
+ datachain/query/dataset.py,sha256=wK_etZkH558pzLKAMBArlj1TQD9n96YK-kpVYBCSR38,57083
107
107
  datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
108
108
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
109
109
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
110
110
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
111
111
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
112
- datachain/query/session.py,sha256=fQAtl5zRESRDfRS2d5J9KgrWauunCtrd96vP4Ns1KlE,5998
112
+ datachain/query/session.py,sha256=I1KG8jDIaxGAfRfDRucMx8DqsANf_VYWtwtXjeD19lI,6399
113
113
  datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
114
114
  datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
115
115
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -135,9 +135,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
135
135
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
136
136
  datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
137
137
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
138
- datachain-0.9.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
139
- datachain-0.9.0.dist-info/METADATA,sha256=GJFjVJKy6bvK7OwltIhcQEA3GHdpR2pHHJEAdoeapM8,11198
140
- datachain-0.9.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
141
- datachain-0.9.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
142
- datachain-0.9.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
143
- datachain-0.9.0.dist-info/RECORD,,
138
+ datachain-0.10.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
139
+ datachain-0.10.0.dist-info/METADATA,sha256=4Eoe6lnoy_HBYtdzrAIjNnagKXagattQ_mluP9WC-ek,11195
140
+ datachain-0.10.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
141
+ datachain-0.10.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
142
+ datachain-0.10.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
143
+ datachain-0.10.0.dist-info/RECORD,,