datachain 0.8.2__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

datachain/cache.py CHANGED
@@ -61,14 +61,16 @@ class DataChainCache:
         tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
         size = file.size
         if size < 0:
-            size = await client.get_size(from_path)
+            size = await client.get_size(from_path, version_id=file.version)
         cb = callback or TqdmCallback(
             tqdm_kwargs={"desc": odb_fs.name(from_path), "bytes": True},
             tqdm_cls=Tqdm,
             size=size,
         )
         try:
-            await client.get_file(from_path, tmp_info, callback=cb)
+            await client.get_file(
+                from_path, tmp_info, callback=cb, version_id=file.version
+            )
         finally:
             if not callback:
                 cb.close()
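
The hunk above threads File.version through the cache download path so that version-aware filesystems fetch exactly the object version recorded in the dataset. A minimal sketch of the new call pattern, with a stub standing in for the real Client (which needs credentials); paths and the version id are hypothetical:

    import asyncio
    from typing import Optional

    class StubClient:
        """Stands in for datachain's Client to show the 0.8.3 signatures."""

        async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
            # A version-aware backend would stat `path` at `version_id` here.
            return 1024

        async def get_file(self, lpath, rpath, callback=None,
                           version_id: Optional[str] = None):
            # Would download lpath (pinned to version_id) into rpath.
            pass

    async def main() -> None:
        client = StubClient()
        # file.version from the dataset row pins the exact object version:
        await client.get_size("bucket/data.csv", version_id="abc123")
        await client.get_file("bucket/data.csv", "/tmp/data.csv", version_id="abc123")

    asyncio.run(main())
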
datachain/catalog/catalog.py CHANGED
@@ -240,7 +240,8 @@ class DatasetRowsFetcher(NodesThreadPool):
 class NodeGroup:
     """Class for a group of nodes from the same source"""
 
-    listing: "Listing"
+    listing: Optional["Listing"]
+    client: "Client"
     sources: list[DataSource]
 
     # The source path within the bucket
@@ -268,9 +269,7 @@ class NodeGroup:
         Download this node group to cache.
         """
         if self.sources:
-            self.listing.client.fetch_nodes(
-                self.iternodes(recursive), shared_progress_bar=pbar
-            )
+            self.client.fetch_nodes(self.iternodes(recursive), shared_progress_bar=pbar)
 
 
 def check_output_dataset_file(
@@ -375,7 +374,7 @@ def collect_nodes_for_cp(
 
     # Collect all sources to process
     for node_group in node_groups:
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         valid_sources: list[DataSource] = []
         for dsrc in node_group.sources:
            if dsrc.is_single_object():
@@ -383,6 +382,7 @@ def collect_nodes_for_cp(
                 total_files += 1
                 valid_sources.append(dsrc)
             else:
+                assert listing
                 node = dsrc.node
                 if not recursive:
                     print(f"{node.full_path} is a directory (not copied).")
@@ -433,37 +433,51 @@ def instantiate_node_groups(
     )
 
     output_dir = output
+    output_file = None
     if copy_to_filename:
         output_dir = os.path.dirname(output)
         if not output_dir:
             output_dir = "."
+        output_file = os.path.basename(output)
 
     # Instantiate these nodes
     for node_group in node_groups:
         if not node_group.sources:
             continue
-        listing: Listing = node_group.listing
+        listing: Optional[Listing] = node_group.listing
         source_path: str = node_group.source_path
 
         copy_dir_contents = always_copy_dir_contents or source_path.endswith("/")
-        instantiated_nodes = listing.collect_nodes_to_instantiate(
-            node_group.sources,
-            copy_to_filename,
-            recursive,
-            copy_dir_contents,
-            source_path,
-            node_group.is_edatachain,
-            node_group.is_dataset,
-        )
-        if not virtual_only:
-            listing.instantiate_nodes(
-                instantiated_nodes,
-                output_dir,
-                total_files,
-                force=force,
-                shared_progress_bar=instantiate_progress_bar,
+        if not listing:
+            source = node_group.sources[0]
+            client = source.client
+            node = NodeWithPath(source.node, [output_file or source.node.path])
+            instantiated_nodes = [node]
+            if not virtual_only:
+                node.instantiate(
+                    client, output_dir, instantiate_progress_bar, force=force
+                )
+        else:
+            instantiated_nodes = listing.collect_nodes_to_instantiate(
+                node_group.sources,
+                copy_to_filename,
+                recursive,
+                copy_dir_contents,
+                source_path,
+                node_group.is_edatachain,
+                node_group.is_dataset,
             )
+            if not virtual_only:
+                listing.instantiate_nodes(
+                    instantiated_nodes,
+                    output_dir,
+                    total_files,
+                    force=force,
+                    shared_progress_bar=instantiate_progress_bar,
+                )
+
         node_group.instantiated_nodes = instantiated_nodes
+
     if instantiate_progress_bar:
         instantiate_progress_bar.close()
 
@@ -592,7 +606,7 @@ class Catalog:
         client_config=None,
         object_name="file",
         skip_indexing=False,
-    ) -> tuple["Listing", str]:
+    ) -> tuple[Optional["Listing"], "Client", str]:
         from datachain.lib.dc import DataChain
         from datachain.listing import Listing
 
@@ -603,16 +617,19 @@ class Catalog:
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
         )
+        lst = None
+        client = Client.get_client(list_uri, self.cache, **self.client_config)
+
+        if list_ds_name:
+            lst = Listing(
+                self.metastore.clone(),
+                self.warehouse.clone(),
+                client,
+                dataset_name=list_ds_name,
+                object_name=object_name,
+            )
 
-        lst = Listing(
-            self.metastore.clone(),
-            self.warehouse.clone(),
-            Client.get_client(list_uri, self.cache, **self.client_config),
-            dataset_name=list_ds_name,
-            object_name=object_name,
-        )
-
-        return lst, list_path
+        return lst, client, list_path
 
     def _remove_dataset_rows_and_warehouse_info(
         self, dataset: DatasetRecord, version: int, **kwargs
@@ -635,13 +652,13 @@ class Catalog:
     ) -> Optional[list["DataSource"]]:
         enlisted_sources = []
         for src in sources:  # Opt: parallel
-            listing, file_path = self.enlist_source(
+            listing, client, file_path = self.enlist_source(
                 src,
                 update,
                 client_config=client_config or self.client_config,
                 skip_indexing=skip_indexing,
             )
-            enlisted_sources.append((listing, file_path))
+            enlisted_sources.append((listing, client, file_path))
 
         if only_index:
             # sometimes we don't really need listing result (e.g on indexing process)
@@ -649,10 +666,16 @@ class Catalog:
             return None
 
         dsrc_all: list[DataSource] = []
-        for listing, file_path in enlisted_sources:
-            nodes = listing.expand_path(file_path)
-            dir_only = file_path.endswith("/")
-            dsrc_all.extend(DataSource(listing, node, dir_only) for node in nodes)
+        for listing, client, file_path in enlisted_sources:
+            if not listing:
+                nodes = [Node.from_file(client.get_file_info(file_path))]
+                dir_only = False
+            else:
+                nodes = listing.expand_path(file_path)
+                dir_only = file_path.endswith("/")
+            dsrc_all.extend(
+                DataSource(listing, client, node, dir_only) for node in nodes
+            )
         return dsrc_all
 
     def enlist_sources_grouped(
@@ -667,7 +690,7 @@ class Catalog:
 
         def _row_to_node(d: dict[str, Any]) -> Node:
            del d["file__source"]
-            return Node.from_dict(d)
+            return Node.from_row(d)
 
         enlisted_sources: list[tuple[bool, bool, Any]] = []
         client_config = client_config or self.client_config
@@ -677,7 +700,7 @@ class Catalog:
                 edatachain_data = parse_edatachain_file(src)
                 indexed_sources = []
                 for ds in edatachain_data:
-                    listing, source_path = self.enlist_source(
+                    listing, _, source_path = self.enlist_source(
                         ds["data-source"]["uri"],
                         update,
                         client_config=client_config,
@@ -701,6 +724,7 @@ class Catalog:
                 client = self.get_client(source, **client_config)
                 uri = client.uri
                 dataset_name, _, _, _ = get_listing(uri, self.session)
+                assert dataset_name
                 listing = Listing(
                     self.metastore.clone(),
                     self.warehouse.clone(),
@@ -713,6 +737,7 @@ class Catalog:
                 indexed_sources.append(
                     (
                         listing,
+                        client,
                         source,
                         [_row_to_node(r) for r in rows],
                         ds_name,
@@ -722,25 +747,28 @@ class Catalog:
 
                 enlisted_sources.append((False, True, indexed_sources))
             else:
-                listing, source_path = self.enlist_source(
+                listing, client, source_path = self.enlist_source(
                     src, update, client_config=client_config
                 )
-                enlisted_sources.append((False, False, (listing, source_path)))
+                enlisted_sources.append((False, False, (listing, client, source_path)))
 
         node_groups = []
         for is_datachain, is_dataset, payload in enlisted_sources:  # Opt: parallel
             if is_dataset:
                 for (
                     listing,
+                    client,
                     source_path,
                     nodes,
                     dataset_name,
                     dataset_version,
                 ) in payload:
-                    dsrc = [DataSource(listing, node) for node in nodes]
+                    assert listing
+                    dsrc = [DataSource(listing, client, node) for node in nodes]
                     node_groups.append(
                         NodeGroup(
                             listing,
+                            client,
                             dsrc,
                             source_path,
                             dataset_name=dataset_name,
@@ -749,18 +777,30 @@ class Catalog:
                     )
             elif is_datachain:
                 for listing, source_path, paths in payload:
-                    dsrc = [DataSource(listing, listing.resolve_path(p)) for p in paths]
+                    assert listing
+                    dsrc = [
+                        DataSource(listing, listing.client, listing.resolve_path(p))
+                        for p in paths
+                    ]
                     node_groups.append(
-                        NodeGroup(listing, dsrc, source_path, is_edatachain=True)
+                        NodeGroup(
+                            listing,
+                            listing.client,
+                            dsrc,
+                            source_path,
+                            is_edatachain=True,
+                        )
                     )
             else:
-                listing, source_path = payload
-                as_container = source_path.endswith("/")
-                dsrc = [
-                    DataSource(listing, n, as_container)
-                    for n in listing.expand_path(source_path, use_glob=not no_glob)
-                ]
-                node_groups.append(NodeGroup(listing, dsrc, source_path))
+                listing, client, source_path = payload
+                if not listing:
+                    nodes = [Node.from_file(client.get_file_info(source_path))]
+                    as_container = False
+                else:
+                    as_container = source_path.endswith("/")
+                    nodes = listing.expand_path(source_path, use_glob=not no_glob)
+                dsrc = [DataSource(listing, client, n, as_container) for n in nodes]
+                node_groups.append(NodeGroup(listing, client, dsrc, source_path))
 
         return node_groups
 
@@ -1196,10 +1236,16 @@ class Catalog:
 
         return q.to_db_records()
 
-    def signed_url(self, source: str, path: str, client_config=None) -> str:
+    def signed_url(
+        self,
+        source: str,
+        path: str,
+        version_id: Optional[str] = None,
+        client_config=None,
+    ) -> str:
         client_config = client_config or self.client_config
         client = Client.get_client(source, self.cache, **client_config)
-        return client.url(path)
+        return client.url(path, version_id=version_id)
 
     def export_dataset_table(
         self,
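
With the Catalog.signed_url change above, presigned URLs can now be pinned to an object version. A hedged usage sketch, assuming `get_catalog` from datachain.catalog and valid cloud credentials; bucket, key, and version id are all hypothetical:

    from datachain.catalog import get_catalog  # assumed import location

    catalog = get_catalog()
    url = catalog.signed_url(
        "s3://my-bucket",
        "images/cat.jpg",
        version_id="3sL4kqtJlcpXr0f3vjVBH40Nr8X8gdRQ",  # omit to sign the latest version
    )
    print(url)
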
datachain/catalog/datasource.py CHANGED
@@ -4,21 +4,19 @@ from datachain.node import DirType, NodeWithPath
 
 
 class DataSource:
-    def __init__(self, listing, node, as_container=False):
+    def __init__(self, listing, client, node, as_container=False):
         self.listing = listing
+        self.client = client
         self.node = node
         self.as_container = (
             as_container  # Indicates whether a .tar file is handled as a container
         )
 
-    def get_full_path(self):
-        return self.get_node_full_path(self.node)
-
     def get_node_full_path(self, node):
-        return self.listing.client.get_full_path(node.full_path)
+        return self.client.get_full_path(node.full_path)
 
     def get_node_full_path_from_path(self, full_path):
-        return self.listing.client.get_full_path(full_path)
+        return self.client.get_full_path(full_path)
 
     def is_single_object(self):
         return self.node.dir_type == DirType.FILE or (
datachain/client/azure.py CHANGED
@@ -1,4 +1,5 @@
-from typing import Any
+from typing import Any, Optional
+from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from adlfs import AzureBlobFileSystem
 from tqdm import tqdm
@@ -25,6 +26,16 @@ class AzureClient(Client):
             size=v.get("size", ""),
         )
 
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate a signed URL for the given path.
+        """
+        version_id = kwargs.pop("version_id", None)
+        result = self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
+        return result + (f"&versionid={version_id}" if version_id else "")
+
     async def _fetch_flat(self, start_prefix: str, result_queue: ResultQueue) -> None:
         prefix = start_prefix
         if prefix:
@@ -57,4 +68,13 @@ class AzureClient(Client):
         finally:
             result_queue.put_nowait(None)
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionid" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionid={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     _fetch_default = _fetch_flat
datachain/client/fsspec.py CHANGED
@@ -137,6 +137,10 @@ class Client(ABC):
         fs.invalidate_cache()
         return fs
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return path
+
     @classmethod
     def from_name(
         cls,
@@ -198,17 +202,37 @@ class Client(ABC):
         return self._fs
 
     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+        return self.fs.sign(
+            self.get_full_path(path, kwargs.pop("version_id", None)),
+            expiration=expires,
+            **kwargs,
+        )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = await self.fs._info(self.get_full_path(file.path))
+        kwargs = {}
+        if self.fs.version_aware:
+            kwargs["version_id"] = file.version
+        info = await self.fs._info(
+            self.get_full_path(file.path, file.version), **kwargs
+        )
         return self.info_to_file(info, "").etag
 
+    def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
+        info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
+        return self.info_to_file(info, path)
+
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+        return await self.fs._size(
+            self.version_path(path, version_id), version_id=version_id
+        )
 
-    async def get_size(self, path: str) -> int:
-        return await self.fs._size(path)
-
-    async def get_file(self, lpath, rpath, callback):
-        return await self.fs._get_file(lpath, rpath, callback=callback)
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+        return await self.fs._get_file(
+            self.version_path(lpath, version_id),
+            rpath,
+            callback=callback,
+            version_id=version_id,
+        )
 
     async def scandir(
         self, start_prefix: str, method: str = "default"
@@ -315,8 +339,8 @@ class Client(ABC):
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]
 
-    def get_full_path(self, rel_path: str) -> str:
-        return f"{self.PREFIX}{self.name}/{rel_path}"
+    def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
+        return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)
 
     @abstractmethod
     def info_to_file(self, v: dict[str, Any], parent: str) -> "File": ...
@@ -362,7 +386,9 @@ class Client(ABC):
         if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")
         assert not file.location
-        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.path, file.version)), cb
+        )  # type: ignore[return-value]
 
     def download(self, file: "File", *, callback: Callback = DEFAULT_CALLBACK) -> None:
         sync(get_loop(), functools.partial(self._download, file, callback=callback))
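
The base class wires get_full_path through version_path as a template method: the default version_path is a no-op, and version-aware subclasses (s3, azure, gcs) override it to encode the version into the path. A self-contained sketch of that relationship (MiniClient is a stand-in, not the real class):

    from typing import Optional

    class MiniClient:
        PREFIX = "s3://"

        def __init__(self, name: str) -> None:
            self.name = name

        @classmethod
        def version_path(cls, path: str, version_id: Optional[str]) -> str:
            return path  # base behavior: versions are ignored

        def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
            return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)

    print(MiniClient("bucket").get_full_path("a/b.txt", "v1"))  # -> s3://bucket/a/b.txt
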
datachain/client/gcs.py CHANGED
@@ -38,9 +38,13 @@ class GCSClient(Client):
         If the client is anonymous, a public URL is returned instead
         (see https://cloud.google.com/storage/docs/access-public-data#api-link).
         """
+        version_id = kwargs.pop("version_id", None)
         if self.fs.storage_options.get("token") == "anon":
-            return f"https://storage.googleapis.com/{self.name}/{path}"
-        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
+            query = f"?generation={version_id}" if version_id else ""
+            return f"https://storage.googleapis.com/{self.name}/{path}{query}"
+        return self.fs.sign(
+            self.get_full_path(path, version_id), expiration=expires, **kwargs
+        )
 
     @staticmethod
     def parse_timestamp(timestamp: str) -> datetime:
@@ -131,3 +135,7 @@ class GCSClient(Client):
             last_modified=self.parse_timestamp(v["updated"]),
             size=v.get("size", ""),
         )
+
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        return f"{path}#{version_id}" if version_id else path
datachain/client/local.py CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -105,10 +105,10 @@ class FileClient(Client):
         info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
-    async def get_size(self, path: str) -> int:
+    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
         return self.fs.size(path)
 
-    async def get_file(self, lpath, rpath, callback):
+    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
         return self.fs.get_file(lpath, rpath, callback=callback)
 
     async def ls_dir(self, path):
@@ -117,7 +117,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)
 
-    def get_full_path(self, rel_path):
+    def get_full_path(self, rel_path, version_id: Optional[str] = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
datachain/client/s3.py CHANGED
@@ -1,5 +1,6 @@
 import asyncio
 from typing import Any, Optional, cast
+from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from botocore.exceptions import NoCredentialsError
 from s3fs import S3FileSystem
@@ -121,6 +122,15 @@ class ClientS3(Client):
             size=v["Size"],
         )
 
+    @classmethod
+    def version_path(cls, path: str, version_id: Optional[str]) -> str:
+        parts = list(urlsplit(path))
+        query = parse_qs(parts[3])
+        if "versionId" in query:
+            raise ValueError("path already includes a version query")
+        parts[3] = f"versionId={version_id}" if version_id else ""
+        return urlunsplit(parts)
+
     async def _fetch_dir(
         self,
         prefix,
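
For comparison, the three backends encode a version id differently. Because version_path is a pure classmethod on each client, the mapping can be checked without any credentials (the version id "v1" is hypothetical):

    from datachain.client.azure import AzureClient
    from datachain.client.gcs import GCSClient
    from datachain.client.s3 import ClientS3

    assert ClientS3.version_path("s3://bucket/key", "v1") == "s3://bucket/key?versionId=v1"
    assert AzureClient.version_path("az://cont/blob", "v1") == "az://cont/blob?versionid=v1"
    assert GCSClient.version_path("gs://bucket/key", "v1") == "gs://bucket/key#v1"
    # The base Client.version_path (fsspec.py above) leaves the path untouched.
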
datachain/dataset.py CHANGED
@@ -92,6 +92,7 @@ class DatasetDependency:
             return self.name
 
         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        assert list_dataset_name
         return list_dataset_name
 
     @classmethod
datachain/lib/dc.py CHANGED
@@ -32,7 +32,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_dat
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import get_listing, list_bucket, ls
+from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
 from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
@@ -438,6 +438,18 @@ class DataChain:
             uri, session, update=update
         )
 
+        # ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            dc = cls.from_values(
+                session=session,
+                settings=settings,
+                in_memory=in_memory,
+                file=[get_file_info(list_uri, cache, client_config=client_config)],
+            )
+            dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+            return dc
+
         if update or not list_ds_exists:
             (
                 cls.from_records(
@@ -1634,7 +1646,7 @@ class DataChain:
         output: OutputType = None,
         object_name: str = "",
         **fr_map,
-    ) -> "DataChain":
+    ) -> "Self":
         """Generate chain from list of values.
 
         Example:
@@ -1647,7 +1659,7 @@ class DataChain:
         def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
             yield from tuples
 
-        chain = DataChain.from_records(
+        chain = cls.from_records(
             DataChain.DEFAULT_FILE_RECORD,
             session=session,
             settings=settings,
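
The from_storage change above means a URI that points at a single object no longer goes through a listing dataset at all. A hedged sketch (hypothetical URI; a real bucket needs credentials):

    from datachain import DataChain

    # With 0.8.3, this skips listing and yields exactly one File row:
    chain = DataChain.from_storage("s3://my-bucket/data/one-file.parquet")
    print(chain.count())  # -> 1
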
datachain/lib/listing.py CHANGED
@@ -39,6 +39,15 @@ def list_bucket(uri: str, cache, client_config=None) -> Callable:
     return list_func
 
 
+def get_file_info(uri: str, cache, client_config=None) -> File:
+    """
+    Wrapper to return File object by its URI
+    """
+    client = Client.get_client(uri, cache, **(client_config or {}))  # type: ignore[arg-type]
+    _, path = Client.parse_url(uri)
+    return client.get_file_info(path)
+
+
 def ls(
     dc: D,
     path: str,
@@ -76,7 +85,7 @@ def ls(
     return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
 
 
-def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
+def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
@@ -85,7 +94,9 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     storage_uri, path = Client.parse_url(uri)
     telemetry.log_param("client", client.PREFIX)
 
-    if uses_glob(path) or client.fs.isfile(uri):
+    if not uri.endswith("/") and client.fs.isfile(uri):
+        return None, f'{storage_uri}/{path.lstrip("/")}', path
+    if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
         storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
@@ -113,7 +124,7 @@ def listing_uri_from_name(dataset_name: str) -> str:
 
 def get_listing(
     uri: str, session: "Session", update: bool = False
-) -> tuple[str, str, str, bool]:
+) -> tuple[Optional[str], str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
@@ -131,6 +142,10 @@ def get_listing(
     ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
     listing = None
 
+    # if we don't want to use cached dataset (e.g. for a single file listing)
+    if not ds_name:
+        return None, list_uri, list_path, False
+
     listings = [
         ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
     ]
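
The early return above changes the tuple contract of parse_listing_uri: a single file now yields no listing dataset name. A sketch of the two shapes (hypothetical URIs; the dataset-name prefix is assumed to follow datachain's "lst__" convention and is approximate):

    # parse_listing_uri("s3://bucket/dir/", cache, client_config)
    #   -> ("lst__s3://bucket/dir/", "s3://bucket/dir/", "dir/")   # directory: listed via a dataset
    # parse_listing_uri("s3://bucket/file.csv", cache, client_config)
    #   -> (None, "s3://bucket/file.csv", "file.csv")              # single file: no listing dataset
    #
    # get_listing() propagates the None name, and callers such as
    # Catalog.enlist_source and DataChain.from_storage then take the
    # direct-file path instead of listing.
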
datachain/listing.py CHANGED
@@ -157,11 +157,7 @@ class Listing:
 
         counter = 0
         for node in all_nodes:
-            dst = os.path.join(output, *node.path)
-            dst_dir = os.path.dirname(dst)
-            os.makedirs(dst_dir, exist_ok=True)
-            file = node.n.to_file(self.client.uri)
-            self.client.instantiate_object(file, dst, progress_bar, force)
+            node.instantiate(self.client, output, progress_bar, force=force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
datachain/node.py CHANGED
@@ -1,3 +1,4 @@
+import os
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -10,6 +11,8 @@ from datachain.utils import TIME_ZERO, time_to_str
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    from datachain.client import Client
+
 
 class DirType:
     FILE = 0
@@ -114,7 +117,21 @@ class Node:
     )
 
     @classmethod
-    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+    def from_file(cls, f: File) -> "Self":
+        return cls(
+            source=StorageURI(f.source),
+            path=f.path,
+            etag=f.etag,
+            is_latest=f.is_latest,
+            size=f.size,
+            last_modified=f.last_modified,
+            version=f.version,
+            location=str(f.location) if f.location else None,
+            dir_type=DirType.FILE,
+        )
+
+    @classmethod
+    def from_row(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
         def _dval(field_name: str):
             return d.get(f"{file_prefix}__{field_name}")
 
@@ -174,6 +191,15 @@ class NodeWithPath:
             path += "/"
         return path
 
+    def instantiate(
+        self, client: "Client", output: str, progress_bar, *, force: bool = False
+    ):
+        dst = os.path.join(output, *self.path)
+        dst_dir = os.path.dirname(dst)
+        os.makedirs(dst_dir, exist_ok=True)
+        file = self.n.to_file(client.uri)
+        client.instantiate_object(file, dst, progress_bar, force)
+
 
 TIME_FMT = "%Y-%m-%d %H:%M"
 
datachain/query/session.py CHANGED
@@ -55,7 +55,7 @@ class Session:
         client_config: Optional[dict] = None,
         in_memory: bool = False,
     ):
-        if re.match(r"^[0-9a-zA-Z]+$", name) is None:
+        if re.match(r"^[0-9a-zA-Z]*$", name) is None:
             raise ValueError(
                 f"Session name can contain only letters or numbers - '{name}' given."
             )
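
The quantifier change from + to * means an empty session name now passes validation (presumably so a generated default name can be used; that reading is an assumption based on the surrounding code). Non-alphanumeric names are still rejected:

    import re

    pattern = re.compile(r"^[0-9a-zA-Z]*$")   # was r"^[0-9a-zA-Z]+$"
    assert pattern.match("")                  # empty name is now accepted
    assert pattern.match("local123")
    assert not pattern.match("bad-name")      # separators still fail
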
datachain-0.8.2.dist-info/METADATA → datachain-0.8.3.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.8.2
+Version: 0.8.3
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -72,7 +72,7 @@ Requires-Dist: pytest<9,>=8; extra == "tests"
 Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
 Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
 Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
-Requires-Dist: pytest-servers[all]>=0.5.8; extra == "tests"
+Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
datachain-0.8.2.dist-info/RECORD → datachain-0.8.3.dist-info/RECORD RENAMED
@@ -1,15 +1,15 @@
 datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
-datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
+datachain/cache.py,sha256=4xI0Ct2gVXuLZPqKdbjmfb_KD2klou-9WnL1WNhIuCA,3077
 datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
+datachain/dataset.py,sha256=5HtqZBRaaToa_C74g62bACjBaCRf2Y6BDgIACLhK1ZA,19161
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
-datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
-datachain/node.py,sha256=o8Sqy92QkzzcLK6XmIFLyDSE6Rw6kUTmGRhEmfLFdhg,5211
+datachain/listing.py,sha256=WdiWMVa0xZ-LtR3SJ0gFLgYUI6VaLI0DSEE_KvfikXs,7582
+datachain/node.py,sha256=HSpjBUBQBWXUUpbUEq839dsSc5KR2O8ww1Udl4jQemY,6023
 datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,1113
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
@@ -18,17 +18,17 @@ datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=9iP8RGm3MHemj13qt1VxRGYAsA6v-627M22o0fr76_M,13906
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=CacK-tfgMM-ZpE0cW7Rfosx1aqXV0shyUy0TfHZnBOQ,58385
-datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
+datachain/catalog/catalog.py,sha256=ixXJKftUIG_ZBPdie1dJAPPHddWV6HZwb3GO-TRHtxY,60103
+datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
-datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
+datachain/client/azure.py,sha256=D-mfLtpiq6O-DaSs-ofEEYhjIZBNfgRw1l9R7UgxEM4,3055
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
-datachain/client/gcs.py,sha256=tAm5CCO86UNuSwTCHVPOiPbj1fBhnEYDoEVLKvv9H5I,4632
+datachain/client/fsspec.py,sha256=rr6-M1iu30x8PAXpOD84U2Vh4CHU0-SdfJFdVZF3ouA,13650
+datachain/client/gcs.py,sha256=MI94GXpCRqAlaF56HNrzQbXA-yR7bn2FOBPzO-lG_SI,4947
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
-datachain/client/local.py,sha256=f2HBqWH8SQM5CyiJ0ljfePVROg2FszWaAn6E2c8RiLE,4596
-datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
+datachain/client/local.py,sha256=iHQKh-HhoNzqZ2yaiuIfZWGXtt_X9FMSA-TN_03zjPc,4708
+datachain/client/s3.py,sha256=67XISS6tW9bnhlbRtKJEAYd_JQvtLHqdPBxm8ySrJl8,6440
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
@@ -53,12 +53,12 @@ datachain/lib/arrow.py,sha256=33Od_XECCfWR9PUDBdevSooXS4mpMdPx_hoMLjpaELU,9734
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=dLBvM_Fr26AQBD3t_-oLSymLyoJQxmkJQs3cC-mZCu8,90492
+datachain/lib/dc.py,sha256=UhyNLYVuCPJPz-EamMVPFjYwzJzbFrDzXvb07PscykI,91015
 datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
 datachain/lib/file.py,sha256=KeccxOulTQCLitdHZoTaq96xpE-5kmWZCrT9X9bRkD0,15049
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
-datachain/lib/listing.py,sha256=C-JmWMJcErbjNJ7ygwaXjP7Ak3nS-MjYTgSn7vvkICg,5536
+datachain/lib/listing.py,sha256=8OPAJZbjPIGQ7qJPyfJEI1s9j9tP0GkKfyHebjQxPx0,6092
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
@@ -95,7 +95,7 @@ datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,93
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
-datachain/query/session.py,sha256=09FtSS3cvfRv2iBQZcbCBMIiRywv7Guhy5nmLEiltq4,5998
+datachain/query/session.py,sha256=fQAtl5zRESRDfRS2d5J9KgrWauunCtrd96vP4Ns1KlE,5998
 datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
 datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -121,9 +121,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.8.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.8.2.dist-info/METADATA,sha256=MFVRJVJBLh_Cq3aV_1V4dHKkf15HTZHLUWWQNbRId3I,11066
-datachain-0.8.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.8.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.8.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.8.2.dist-info/RECORD,,
+datachain-0.8.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.8.3.dist-info/METADATA,sha256=gAPCEMlRQirhIDHK61LPuF1NNNaZQxwMlTVG-8fZDnM,11066
+datachain-0.8.3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.8.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.8.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.8.3.dist-info/RECORD,,