datachain 0.9.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -390,7 +390,7 @@ class Client(ABC):
390
390
  ) # type: ignore[return-value]
391
391
 
392
392
  def upload(self, data: bytes, path: str) -> "File":
393
- full_path = self.get_full_path(path)
393
+ full_path = path if path.startswith(self.PREFIX) else self.get_full_path(path)
394
394
 
395
395
  parent = posixpath.dirname(full_path)
396
396
  self.fs.makedirs(parent, exist_ok=True)
datachain/lib/dc.py CHANGED
@@ -411,6 +411,7 @@ class DataChain:
411
411
  object_name: str = "file",
412
412
  update: bool = False,
413
413
  anon: bool = False,
414
+ client_config: Optional[dict] = None,
414
415
  ) -> "Self":
415
416
  """Get data from a storage as a list of file with all file attributes.
416
417
  It returns the chain itself as usual.
@@ -423,15 +424,32 @@ class DataChain:
423
424
  object_name : Created object column name.
424
425
  update : force storage reindexing. Default is False.
425
426
  anon : If True, we will treat cloud bucket as public one
427
+ client_config : Optional client configuration for the storage client.
426
428
 
427
429
  Example:
430
+ Simple call from s3
428
431
  ```py
429
432
  chain = DataChain.from_storage("s3://my-bucket/my-dir")
430
433
  ```
434
+
435
+ With AWS S3-compatible storage
436
+ ```py
437
+ chain = DataChain.from_storage(
438
+ "s3://my-bucket/my-dir",
439
+ client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
440
+ )
441
+ ```
442
+
443
+ Pass existing session
444
+ ```py
445
+ session = Session.get()
446
+ chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
447
+ ```
431
448
  """
432
449
  file_type = get_file_type(type)
433
450
 
434
- client_config = {"anon": True} if anon else None
451
+ if anon:
452
+ client_config = (client_config or {}) | {"anon": True}
435
453
  session = Session.get(session, client_config=client_config, in_memory=in_memory)
436
454
  cache = session.catalog.cache
437
455
  client_config = session.catalog.client_config
@@ -481,25 +499,56 @@ class DataChain:
481
499
  version: Optional[int] = None,
482
500
  session: Optional[Session] = None,
483
501
  settings: Optional[dict] = None,
484
- fallback_to_remote: bool = True,
502
+ fallback_to_studio: bool = True,
485
503
  ) -> "Self":
486
504
  """Get data from a saved Dataset. It returns the chain itself.
505
+ If dataset or version is not found locally, it will try to pull it from Studio.
487
506
 
488
507
  Parameters:
489
508
  name : dataset name
490
509
  version : dataset version
510
+ session : Session to use for the chain.
511
+ settings : Settings to use for the chain.
512
+ fallback_to_studio : Try to pull dataset from Studio if not found locally.
513
+ Default is True.
491
514
 
492
515
  Example:
493
516
  ```py
494
517
  chain = DataChain.from_dataset("my_cats")
495
518
  ```
519
+
520
+ ```py
521
+ chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
522
+ ```
523
+
524
+ ```py
525
+ chain = DataChain.from_dataset("my_cats", version=1)
526
+ ```
527
+
528
+ ```py
529
+ session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
530
+ settings = {
531
+ "cache": True,
532
+ "parallel": 4,
533
+ "workers": 4,
534
+ "min_task_size": 1000,
535
+ "prefetch": 10,
536
+ }
537
+ chain = DataChain.from_dataset(
538
+ name="my_cats",
539
+ version=1,
540
+ session=session,
541
+ settings=settings,
542
+ fallback_to_studio=True,
543
+ )
544
+ ```
496
545
  """
497
546
  query = DatasetQuery(
498
547
  name=name,
499
548
  version=version,
500
549
  session=session,
501
550
  indexing_column_types=File._datachain_column_types,
502
- fallback_to_remote=fallback_to_remote,
551
+ fallback_to_studio=fallback_to_studio,
503
552
  )
504
553
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
505
554
  if settings:
@@ -2444,7 +2493,7 @@ class DataChain:
2444
2493
  self._setup = self._setup | kwargs
2445
2494
  return self
2446
2495
 
2447
- def export_files(
2496
+ def to_storage(
2448
2497
  self,
2449
2498
  output: str,
2450
2499
  signal: str = "file",
@@ -2462,6 +2511,13 @@ class DataChain:
2462
2511
  use_cache: If `True`, cache the files before exporting.
2463
2512
  link_type: Method to use for exporting files.
2464
2513
  Falls back to `'copy'` if symlinking fails.
2514
+
2515
+ Example:
2516
+ Cross cloud transfer
2517
+ ```py
2518
+ ds = DataChain.from_storage("s3://mybucket")
2519
+ ds.to_storage("gs://mybucket", placement="filename")
2520
+ ```
2465
2521
  """
2466
2522
  if placement == "filename" and (
2467
2523
  self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
datachain/lib/file.py CHANGED
@@ -17,6 +17,7 @@ from urllib.parse import unquote, urlparse
17
17
  from urllib.request import url2pathname
18
18
 
19
19
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
20
+ from fsspec.utils import stringify_path
20
21
  from PIL import Image as PilImage
21
22
  from pydantic import Field, field_validator
22
23
 
@@ -270,8 +271,9 @@ class File(DataModel):
270
271
 
271
272
  def save(self, destination: str):
272
273
  """Writes it's content to destination"""
273
- with open(destination, mode="wb") as f:
274
- f.write(self.read())
274
+ destination = stringify_path(destination)
275
+ client: Client = self._catalog.get_client(str(destination))
276
+ client.upload(self.read(), str(destination))
275
277
 
276
278
  def _symlink_to(self, destination: str):
277
279
  if self.location:
@@ -285,6 +287,7 @@ class File(DataModel):
285
287
  source = self.get_path()
286
288
  else:
287
289
  raise OSError(errno.EXDEV, "can't link across filesystems")
290
+
288
291
  return os.symlink(source, destination)
289
292
 
290
293
  def export(
@@ -299,7 +302,8 @@ class File(DataModel):
299
302
  self._caching_enabled = use_cache
300
303
  dst = self.get_destination_path(output, placement)
301
304
  dst_dir = os.path.dirname(dst)
302
- os.makedirs(dst_dir, exist_ok=True)
305
+ client: Client = self._catalog.get_client(dst_dir)
306
+ client.fs.makedirs(dst_dir, exist_ok=True)
303
307
 
304
308
  if link_type == "symlink":
305
309
  try:
@@ -496,7 +500,10 @@ class TextFile(File):
496
500
 
497
501
  def save(self, destination: str):
498
502
  """Writes it's content to destination"""
499
- with open(destination, mode="w") as f:
503
+ destination = stringify_path(destination)
504
+
505
+ client: Client = self._catalog.get_client(destination)
506
+ with client.fs.open(destination, mode="w") as f:
500
507
  f.write(self.read_text())
501
508
 
502
509
 
@@ -510,7 +517,11 @@ class ImageFile(File):
510
517
 
511
518
  def save(self, destination: str):
512
519
  """Writes it's content to destination"""
513
- self.read().save(destination)
520
+ destination = stringify_path(destination)
521
+
522
+ client: Client = self._catalog.get_client(destination)
523
+ with client.fs.open(destination, mode="wb") as f:
524
+ self.read().save(f)
514
525
 
515
526
 
516
527
  class Image(DataModel):
@@ -1085,7 +1085,7 @@ class DatasetQuery:
1085
1085
  session: Optional[Session] = None,
1086
1086
  indexing_column_types: Optional[dict[str, Any]] = None,
1087
1087
  in_memory: bool = False,
1088
- fallback_to_remote: bool = True,
1088
+ fallback_to_studio: bool = True,
1089
1089
  ) -> None:
1090
1090
  self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
1091
1091
  self.catalog = catalog or self.session.catalog
@@ -1103,7 +1103,7 @@ class DatasetQuery:
1103
1103
 
1104
1104
  self.name = name
1105
1105
 
1106
- if fallback_to_remote and is_token_set():
1106
+ if fallback_to_studio and is_token_set():
1107
1107
  ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1108
1108
  else:
1109
1109
  ds = self.catalog.get_dataset(name)
@@ -139,21 +139,33 @@ class Session:
139
139
 
140
140
  # Access the active (most recent) context from the stack
141
141
  if cls.SESSION_CONTEXTS:
142
- return cls.SESSION_CONTEXTS[-1]
142
+ session = cls.SESSION_CONTEXTS[-1]
143
143
 
144
- if cls.GLOBAL_SESSION_CTX is None:
144
+ elif cls.GLOBAL_SESSION_CTX is None:
145
145
  cls.GLOBAL_SESSION_CTX = Session(
146
146
  cls.GLOBAL_SESSION_NAME,
147
147
  catalog,
148
148
  client_config=client_config,
149
149
  in_memory=in_memory,
150
150
  )
151
+ session = cls.GLOBAL_SESSION_CTX
151
152
 
152
153
  atexit.register(cls._global_cleanup)
153
154
  cls.ORIGINAL_EXCEPT_HOOK = sys.excepthook
154
155
  sys.excepthook = cls.except_hook
156
+ else:
157
+ session = cls.GLOBAL_SESSION_CTX
155
158
 
156
- return cls.GLOBAL_SESSION_CTX
159
+ if client_config and session.catalog.client_config != client_config:
160
+ session = Session(
161
+ "session" + uuid4().hex[:4],
162
+ catalog,
163
+ client_config=client_config,
164
+ in_memory=in_memory,
165
+ )
166
+ session.__enter__()
167
+
168
+ return session
157
169
 
158
170
  @staticmethod
159
171
  def except_hook(exc_type, exc_value, exc_traceback):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.9.1
3
+ Version: 0.10.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -175,7 +175,7 @@ high confidence scores.
175
175
 
176
176
  likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
177
177
  & (Column("meta.inference.class_") == "cat"))
178
- likely_cats.export_files("high-confidence-cats/", signal="file")
178
+ likely_cats.to_storage("high-confidence-cats/", signal="file")
179
179
 
180
180
 
181
181
  Example: LLM based text-file evaluation
@@ -216,7 +216,7 @@ Python code:
216
216
  )
217
217
 
218
218
  successful_chain = chain.filter(Column("is_success") == True)
219
- successful_chain.export_files("./output_mistral")
219
+ successful_chain.to_storage("./output_mistral")
220
220
 
221
221
  print(f"{successful_chain.count()} files were exported")
222
222
 
@@ -36,7 +36,7 @@ datachain/cli/parser/utils.py,sha256=GEzxfPJ4i6nt6JhjvZ3PQesXl9islEV3E-N1NZGrLaA
36
36
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
37
37
  datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
38
38
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
39
- datachain/client/fsspec.py,sha256=whQWKD0tGQUjc8FpA0irxc31wiJuQS4x2arHc98-Lv0,13966
39
+ datachain/client/fsspec.py,sha256=N_n3_DtZuKsLst8-XVda2xYCUHreUU3ld0MNTl8L9f4,14008
40
40
  datachain/client/gcs.py,sha256=TY5K5INORKknTnoWDYv0EUztVLmuY1hHmdf2wUB_9uE,5114
41
41
  datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
42
42
  datachain/client/local.py,sha256=Pv67SYdkNkkNExBoKJF9AnNu0FSrt4JqLRkSVsUnveU,4672
@@ -68,8 +68,8 @@ datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
68
68
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
69
69
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
70
70
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
71
- datachain/lib/dc.py,sha256=rQZgFLFSIde6wfiAnBnlSE4qnjNjNXQ0F3TGhDQ6ap8,93459
72
- datachain/lib/file.py,sha256=jtjypMGEvXw3S1oJHeWYiAKJREhhdI7cLGKME8obA78,27030
71
+ datachain/lib/dc.py,sha256=QQPnrS_OB1d3CfjLnYtRByGc7wNX_YT24WOjaoFPJgw,95372
72
+ datachain/lib/file.py,sha256=8OblP_hYJLh0z7MWGo3AiyO48eEJ13tzgla1UQf9A8I,27517
73
73
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
74
74
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
75
75
  datachain/lib/listing.py,sha256=auodM0HitYZsL0DybdgQUYhne_LgkVW-LKGYYOACP90,7272
@@ -103,13 +103,13 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
103
103
  datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
104
104
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
105
105
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
106
- datachain/query/dataset.py,sha256=tXinFa-0ytC4j3W3XKjSkVW3qX3iFGQyRO800k9JW98,57083
106
+ datachain/query/dataset.py,sha256=wK_etZkH558pzLKAMBArlj1TQD9n96YK-kpVYBCSR38,57083
107
107
  datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
108
108
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
109
109
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
110
110
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
111
111
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
112
- datachain/query/session.py,sha256=fQAtl5zRESRDfRS2d5J9KgrWauunCtrd96vP4Ns1KlE,5998
112
+ datachain/query/session.py,sha256=I1KG8jDIaxGAfRfDRucMx8DqsANf_VYWtwtXjeD19lI,6399
113
113
  datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
114
114
  datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
115
115
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -135,9 +135,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
135
135
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
136
136
  datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
137
137
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
138
- datachain-0.9.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
139
- datachain-0.9.1.dist-info/METADATA,sha256=WC_qkAVg28I5GkwG_XDGoNC_2e0hKcBC7kqEzmna71A,11198
140
- datachain-0.9.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
141
- datachain-0.9.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
142
- datachain-0.9.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
143
- datachain-0.9.1.dist-info/RECORD,,
138
+ datachain-0.10.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
139
+ datachain-0.10.0.dist-info/METADATA,sha256=4Eoe6lnoy_HBYtdzrAIjNnagKXagattQ_mluP9WC-ek,11195
140
+ datachain-0.10.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
141
+ datachain-0.10.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
142
+ datachain-0.10.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
143
+ datachain-0.10.0.dist-info/RECORD,,