datachain 0.8.10__py3-none-any.whl → 0.8.11__py3-none-any.whl


@@ -3,6 +3,7 @@ import functools
  import logging
  import multiprocessing
  import os
+ import posixpath
  import re
  import sys
  from abc import ABC, abstractmethod
@@ -25,7 +26,7 @@ from fsspec.asyn import get_loop, sync
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
  from tqdm.auto import tqdm

- from datachain.cache import DataChainCache
+ from datachain.cache import Cache
  from datachain.client.fileslice import FileWrapper
  from datachain.error import ClientError as DataChainClientError
  from datachain.nodes_fetcher import NodesFetcher
@@ -74,9 +75,7 @@ class Client(ABC):
      PREFIX: ClassVar[str]
      protocol: ClassVar[str]

-     def __init__(
-         self, name: str, fs_kwargs: dict[str, Any], cache: DataChainCache
-     ) -> None:
+     def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
          self.name = name
          self.fs_kwargs = fs_kwargs
          self._fs: Optional[AbstractFileSystem] = None
@@ -122,7 +121,7 @@ class Client(ABC):
          return cls.get_uri(storage_name), rel_path

      @staticmethod
-     def get_client(source: str, cache: DataChainCache, **kwargs) -> "Client":
+     def get_client(source: str, cache: Cache, **kwargs) -> "Client":
          cls = Client.get_implementation(source)
          storage_url, _ = cls.split_url(source)
          if os.name == "nt":
@@ -145,7 +144,7 @@ class Client(ABC):
      def from_name(
          cls,
          name: str,
-         cache: DataChainCache,
+         cache: Cache,
          kwargs: dict[str, Any],
      ) -> "Client":
          return cls(name, kwargs, cache)
@@ -154,7 +153,7 @@ class Client(ABC):
      def from_source(
          cls,
          uri: "StorageURI",
-         cache: DataChainCache,
+         cache: Cache,
          **kwargs,
      ) -> "Client":
          return cls(cls.FS_CLASS._strip_protocol(uri), kwargs, cache)
@@ -390,8 +389,12 @@ class Client(ABC):
              self.fs.open(self.get_full_path(file.path, file.version)), cb
          )  # type: ignore[return-value]

-     def upload(self, path: str, data: bytes) -> "File":
+     def upload(self, data: bytes, path: str) -> "File":
          full_path = self.get_full_path(path)
+
+         parent = posixpath.dirname(full_path)
+         self.fs.makedirs(parent, exist_ok=True)
+
          self.fs.pipe_file(full_path, data)
          file_info = self.fs.info(full_path)
          return self.info_to_file(file_info, path)
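
Note: a minimal usage sketch of the reordered `upload(data, path)` signature; the bucket, key, and `cache` object below are placeholders, not part of this diff.

```py
from datachain.client import Client

# cache is assumed to be a datachain.cache.Cache instance
client = Client.get_client("s3://my-bucket", cache)
# new argument order: data first, then path; missing parent
# "directories" are now created before the write
file = client.upload(b"hello", "reports/2024/summary.txt")
```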
datachain/client/local.py CHANGED
@@ -12,7 +12,7 @@ from datachain.lib.file import File
  from .fsspec import Client

  if TYPE_CHECKING:
-     from datachain.cache import DataChainCache
+     from datachain.cache import Cache
      from datachain.dataset import StorageURI


@@ -25,7 +25,7 @@ class FileClient(Client):
          self,
          name: str,
          fs_kwargs: dict[str, Any],
-         cache: "DataChainCache",
+         cache: "Cache",
          use_symlinks: bool = False,
      ) -> None:
          super().__init__(name, fs_kwargs, cache)
@@ -82,7 +82,7 @@ class FileClient(Client):
          return bucket, path

      @classmethod
-     def from_name(cls, name: str, cache: "DataChainCache", kwargs) -> "FileClient":
+     def from_name(cls, name: str, cache: "Cache", kwargs) -> "FileClient":
          use_symlinks = kwargs.pop("use_symlinks", False)
          return cls(name, kwargs, cache, use_symlinks=use_symlinks)

@@ -90,7 +90,7 @@ class FileClient(Client):
      def from_source(
          cls,
          uri: str,
-         cache: "DataChainCache",
+         cache: "Cache",
          use_symlinks: bool = False,
          **kwargs,
      ) -> "FileClient":
@@ -200,7 +200,7 @@ class DataTable:
          columns: Sequence["sa.Column"] = (),
          metadata: Optional["sa.MetaData"] = None,
      ):
-         # copy columns, since re-using the same objects from another table
+         # copy columns, since reusing the same objects from another table
          # may raise an error
          columns = cls.sys_columns() + [cls.copy_column(c) for c in columns]
          columns = dedup_columns(columns)
datachain/dataset.py CHANGED
@@ -91,7 +91,7 @@ class DatasetDependency:
          if self.type == DatasetDependencyType.DATASET:
              return self.name

-         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
          assert list_dataset_name
          return list_dataset_name

datachain/error.py CHANGED
@@ -1,3 +1,15 @@
+ import botocore.errorfactory
+ import botocore.exceptions
+ import gcsfs.retry
+
+ REMOTE_ERRORS = (
+     gcsfs.retry.HttpError,  # GCS
+     OSError,  # GCS
+     botocore.exceptions.BotoCoreError,  # S3
+     ValueError,  # Azure
+ )
+
+
  class DataChainError(RuntimeError):
      pass

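
Note: a sketch of how the new `REMOTE_ERRORS` tuple is meant to be consumed; it mirrors the handler added in `datachain/lib/listing.py` further down, with `client` and `path` as placeholders.

```py
from datachain.error import REMOTE_ERRORS, ClientError

try:
    info = client.fs.info(path)  # any fsspec call against remote storage
except REMOTE_ERRORS as e:
    # normalize provider-specific failures into a single ClientError
    raise ClientError(message=str(e), error_code=getattr(e, "code", None)) from e
```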
@@ -16,7 +16,7 @@ from .aggregate import (
      sum,
  )
  from .array import cosine_distance, euclidean_distance, length, sip_hash_64
- from .conditional import case, greatest, ifelse, least
+ from .conditional import case, greatest, ifelse, isnone, least
  from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
  from .random import rand
  from .string import byte_hamming_distance
@@ -42,6 +42,7 @@ __all__ = [
      "greatest",
      "ifelse",
      "int_hash_64",
+     "isnone",
      "least",
      "length",
      "literal",
@@ -1,14 +1,15 @@
- from typing import Union
+ from typing import Optional, Union

+ from sqlalchemy import ColumnElement
  from sqlalchemy import case as sql_case
- from sqlalchemy.sql.elements import BinaryExpression

  from datachain.lib.utils import DataChainParamsError
+ from datachain.query.schema import Column
  from datachain.sql.functions import conditional

  from .func import ColT, Func

- CaseT = Union[int, float, complex, bool, str]
+ CaseT = Union[int, float, complex, bool, str, Func]


  def greatest(*args: Union[ColT, float]) -> Func:
@@ -87,17 +88,21 @@ def least(*args: Union[ColT, float]) -> Func:
      )


- def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
+ def case(
+     *args: tuple[Union[ColumnElement, Func], CaseT], else_: Optional[CaseT] = None
+ ) -> Func:
      """
      Returns the case function that produces case expression which has a list of
-     conditions and corresponding results. Results can only be python primitives
-     like string, numbes or booleans. Result type is inferred from condition results.
+     conditions and corresponding results. Results can be python primitives like string,
+     numbers or booleans, but can also be other nested functions (including case).
+     Result type is inferred from condition results.

      Args:
-         args (tuple(BinaryExpression, value(str | int | float | complex | bool):
-             - Tuple of binary expression and values pair which corresponds to one
-             case condition - value
-         else_ (str | int | float | complex | bool): else value in case expression
+         args (tuple((ColumnElement, Func), (str | int | float | complex | bool, Func))):
+             Tuple of condition and values pair.
+         else_ (str | int | float | complex | bool, Func): optional else value in case
+             expression. If omitted, and no case conditions are satisfied, the result
+             will be None (NULL in DB).

      Returns:
          Func: A Func object that represents the case function.
@@ -111,15 +116,24 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
      """
      supported_types = [int, float, complex, str, bool]

-     type_ = type(else_) if else_ else None
+     def _get_type(val):
+         if isinstance(val, Func):
+             # nested functions
+             return val.result_type
+         return type(val)

      if not args:
          raise DataChainParamsError("Missing statements")

+     type_ = _get_type(else_) if else_ is not None else None
+
      for arg in args:
-         if type_ and not isinstance(arg[1], type_):
-             raise DataChainParamsError("Statement values must be of the same type")
-         type_ = type(arg[1])
+         arg_type = _get_type(arg[1])
+         if type_ and arg_type != type_:
+             raise DataChainParamsError(
+                 f"Statement values must be of the same type, got {type_} and {arg_type}"
+             )
+         type_ = arg_type

      if type_ not in supported_types:
          raise DataChainParamsError(
@@ -127,20 +141,25 @@ def case(*args: tuple[BinaryExpression, CaseT], else_=None) -> Func:
      )

      kwargs = {"else_": else_}
-     return Func("case", inner=sql_case, args=args, kwargs=kwargs, result_type=type_)
+
+     return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)


- def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
+ def ifelse(
+     condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
+ ) -> Func:
      """
      Returns the ifelse function that produces if expression which has a condition
-     and values for true and false outcome. Results can only be python primitives
-     like string, numbes or booleans. Result type is inferred from the values.
+     and values for true and false outcome. Results can be one of python primitives
+     like string, numbers or booleans, but can also be nested functions.
+     Result type is inferred from the values.

      Args:
-         condition: BinaryExpression - condition which is evaluated
-         if_val: (str | int | float | complex | bool): value for true condition outcome
-         else_val: (str | int | float | complex | bool): value for false condition
-             outcome
+         condition (ColumnElement, Func): Condition which is evaluated.
+         if_val (str | int | float | complex | bool, Func): Value for true
+             condition outcome.
+         else_val (str | int | float | complex | bool, Func): Value for false condition
+             outcome.

      Returns:
          Func: A Func object that represents the ifelse function.
@@ -148,8 +167,33 @@ def ifelse(condition: BinaryExpression, if_val: CaseT, else_val: CaseT) -> Func:
      Example:
          ```py
          dc.mutate(
-             res=func.ifelse(C("num") > 0, "P", "N"),
+             res=func.ifelse(isnone("col"), "EMPTY", "NOT_EMPTY")
          )
          ```
      """
      return case((condition, if_val), else_=else_val)
+
+
+ def isnone(col: Union[str, Column]) -> Func:
+     """
+     Returns True if column value is None, otherwise False.
+
+     Args:
+         col (str | Column): Column to check if it's None or not.
+             If a string is provided, it is assumed to be the name of the column.
+
+     Returns:
+         Func: A Func object that represents the conditional to check if column is None.
+
+     Example:
+         ```py
+         dc.mutate(test=ifelse(isnone("col"), "EMPTY", "NOT_EMPTY"))
+         ```
+     """
+     from datachain import C
+
+     if isinstance(col, str):
+         # if string, it is assumed to be the name of the column
+         col = C(col)
+
+     return case((col.is_(None) if col is not None else True, True), else_=False)
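
Note: a hedged usage sketch combining the new conditional pieces (nested `Func` values in `case`, and the new `isnone`); the signal names below are illustrative, not from this diff.

```py
from datachain import C, func

chain = chain.mutate(
    size_label=func.case(
        (C("file.size") > 1_000_000, "big"),
        # a nested function as the else_ value is now allowed
        else_=func.ifelse(func.isnone("file.location"), "unknown", "small"),
    )
)
```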
datachain/func/func.py CHANGED
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
      from .window import Window


- ColT = Union[str, ColumnElement, "Func"]
+ ColT = Union[str, ColumnElement, "Func", tuple]


  class Func(Function):
@@ -78,7 +78,7 @@ class Func(Function):
          return (
              [
                  col
                  if isinstance(col, (Func, BindParameter, Case, Comparator))
+                 if isinstance(col, (Func, BindParameter, Case, Comparator, tuple))
                  else ColumnMeta.to_db_name(
                      col.name if isinstance(col, ColumnElement) else col
                  )
@@ -381,17 +381,24 @@ class Func(Function):
          col_type = self.get_result_type(signals_schema)
          sql_type = python_to_sql(col_type)

-         def get_col(col: ColT) -> ColT:
+         def get_col(col: ColT, string_as_literal=False) -> ColT:
+             # string_as_literal is used only for conditionals like `case()` where
+             # literals are nested inside ColT as we have tuples of condition - values
+             # and if user wants to set some case value as column, explicit `C("col")`
+             # syntax must be used to distinguish from literals
+             if isinstance(col, tuple):
+                 return tuple(get_col(x, string_as_literal=True) for x in col)
              if isinstance(col, Func):
                  return col.get_column(signals_schema, table=table)
-             if isinstance(col, str):
+             if isinstance(col, str) and not string_as_literal:
                  column = Column(col, sql_type)
                  column.table = table
                  return column
              return col

          cols = [get_col(col) for col in self._db_cols]
-         func_col = self.inner(*cols, *self.args, **self.kwargs)
+         kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()}
+         func_col = self.inner(*cols, *self.args, **kwargs)

          if self.is_window:
              if not self.window:
@@ -416,6 +423,11 @@ class Func(Function):


  def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
+     if isinstance(col, tuple):
+         raise DataChainParamsError(
+             "Cannot get type from tuple, please provide type hint to the function"
+         )
+
      if isinstance(col, Func):
          return col.get_result_type(signals_schema)

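
Note: a short sketch of the literal-vs-column rule that `string_as_literal` implements: strings nested inside `case()`/`ifelse()` tuples stay literal values, while a top-level string argument to a function still resolves to a column. Column names are placeholders.

```py
from datachain import C, func

func.ifelse(C("name") == "", "unnamed", "named")  # "unnamed"/"named" are literals
func.sum("size")  # a top-level string still resolves to the "size" column
```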
datachain/lib/dc.py CHANGED
@@ -25,6 +25,7 @@ from sqlalchemy.sql.functions import GenericFunction
  from sqlalchemy.sql.sqltypes import NullType

  from datachain.dataset import DatasetRecord
+ from datachain.func import literal
  from datachain.func.base import Function
  from datachain.func.func import Func
  from datachain.lib.convert.python_to_sql import python_to_sql
@@ -1129,8 +1130,12 @@ class DataChain:
          )
          ```
          """
+         primitives = (bool, str, int, float)
+
          for col_name, expr in kwargs.items():
-             if not isinstance(expr, (Column, Func)) and isinstance(expr.type, NullType):
+             if not isinstance(expr, (*primitives, Column, Func)) and isinstance(
+                 expr.type, NullType
+             ):
                  raise DataChainColumnError(
                      col_name, f"Cannot infer type with expression {expr}"
                  )
@@ -1145,6 +1150,11 @@ class DataChain:
              elif isinstance(value, Func):
                  # adding new signal
                  mutated[name] = value.get_column(schema)
+             elif isinstance(value, primitives):
+                 # adding simple python constant primitives like str, int, float, bool
+                 val = literal(value)
+                 val.type = python_to_sql(type(value))()
+                 mutated[name] = val  # type: ignore[assignment]
              else:
                  # adding new signal
                  mutated[name] = value
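
Note: with this change `mutate()` accepts plain Python constants, wrapped as typed SQL literals; a small sketch (`chain` is a placeholder `DataChain`):

```py
chain = chain.mutate(
    source="manual",  # str constant -> typed literal column
    verified=True,    # bool constant
    score=0.0,        # float constant
)
```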
@@ -1942,7 +1952,7 @@ class DataChain:
      def from_csv(
          cls,
          path,
-         delimiter: str = ",",
+         delimiter: Optional[str] = None,
          header: bool = True,
          output: OutputType = None,
          object_name: str = "",
@@ -1952,6 +1962,7 @@ class DataChain:
          session: Optional[Session] = None,
          settings: Optional[dict] = None,
          column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+         parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
          **kwargs,
      ) -> "DataChain":
          """Generate chain from csv files.
@@ -1959,7 +1970,8 @@ class DataChain:
          Parameters:
              path : Storage URI with directory. URI must start with storage prefix such
                  as `s3://`, `gs://`, `az://` or "file:///".
-             delimiter : Character for delimiting columns.
+             delimiter : Character for delimiting columns. Takes precedence if also
+                 specified in `parse_options`. Defaults to ",".
              header : Whether the files include a header row.
              output : Dictionary or feature class defining column names and their
                  corresponding types. List of column names is also accepted, in which
@@ -1973,6 +1985,8 @@ class DataChain:
              column_types : Dictionary of column names and their corresponding types.
                  It is passed to CSV reader and for each column specified type auto
                  inference is disabled.
+             parse_options : Tells the parser how to process lines.
+                 See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html

          Example:
              Reading a csv file:
@@ -1990,6 +2004,12 @@ class DataChain:
          from pyarrow.dataset import CsvFileFormat
          from pyarrow.lib import type_for_alias

+         parse_options = parse_options or {}
+         if "delimiter" not in parse_options:
+             parse_options["delimiter"] = ","
+         if delimiter:
+             parse_options["delimiter"] = delimiter
+
          if column_types:
              column_types = {
                  name: type_for_alias(typ) if isinstance(typ, str) else typ
@@ -2017,7 +2037,7 @@ class DataChain:
              msg = f"error parsing csv - incompatible output type {type(output)}"
              raise DatasetPrepareError(chain.name, msg)

-         parse_options = ParseOptions(delimiter=delimiter)
+         parse_options = ParseOptions(**parse_options)
          read_options = ReadOptions(column_names=column_names)
          convert_options = ConvertOptions(
              strings_can_be_null=True,
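
Note: a sketch of forwarding pyarrow `ParseOptions` fields through `from_csv`; the URI is a placeholder, and keys must be valid `pyarrow.csv.ParseOptions` arguments.

```py
from datachain import DataChain

chain = DataChain.from_csv(
    "s3://my-bucket/data/",
    parse_options={"delimiter": ";", "ignore_empty_lines": True},
)
```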
datachain/lib/file.py CHANGED
@@ -190,6 +190,22 @@ class File(DataModel):
          self._catalog = None
          self._caching_enabled: bool = False

+     @classmethod
+     def upload(
+         cls, data: bytes, path: str, catalog: Optional["Catalog"] = None
+     ) -> "File":
+         if catalog is None:
+             from datachain.catalog.loader import get_catalog
+
+             catalog = get_catalog()
+
+         parent, name = posixpath.split(path)
+
+         client = catalog.get_client(parent)
+         file = client.upload(data, name)
+         file._set_stream(catalog)
+         return file
+
      @classmethod
      def _from_row(cls, row: "RowDict") -> "Self":
          return cls(**{key: row[key] for key in cls._datachain_column_types})
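
Note: a minimal sketch of the new `File.upload` helper; the bucket and key are placeholders, and the default catalog is resolved when none is passed explicitly.

```py
from datachain.lib.file import File

file = File.upload(b"hello, world", "s3://my-bucket/notes/hello.txt")
print(file.path, file.size)
```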
datachain/lib/listing.py CHANGED
@@ -1,3 +1,5 @@
+ import logging
+ import os
  import posixpath
  from collections.abc import Iterator
  from typing import TYPE_CHECKING, Callable, Optional, TypeVar
@@ -7,6 +9,7 @@ from sqlalchemy.sql.expression import true

  from datachain.asyn import iter_over_async
  from datachain.client import Client
+ from datachain.error import REMOTE_ERRORS, ClientError
  from datachain.lib.file import File
  from datachain.query.schema import Column
  from datachain.sql.functions import path as pathfunc
@@ -22,6 +25,10 @@ LISTING_PREFIX = "lst__"  # listing datasets start with this name

  D = TypeVar("D", bound="DataChain")

+ # Disable warnings for remote errors in clients
+ logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
+ logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
+

  def list_bucket(uri: str, cache, client_config=None) -> Callable:
      """
@@ -90,6 +97,15 @@ def _isfile(client: "Client", path: str) -> bool:
      Returns True if uri points to a file
      """
      try:
+         if "://" in path:
+             # This makes sure that the uppercase scheme is converted to lowercase
+             scheme, path = path.split("://", 1)
+             path = f"{scheme.lower()}://{path}"
+
+         if os.name == "nt" and "*" in path:
+             # On Windows, the glob pattern "*" is not supported
+             return False
+
          info = client.fs.info(path)
          name = info.get("name")
          # case for special simulated directories on some clouds
@@ -99,21 +115,21 @@ def _isfile(client: "Client", path: str) -> bool:
              return False

          return info["type"] == "file"
-     except:  # noqa: E722
+     except FileNotFoundError:
          return False
+     except REMOTE_ERRORS as e:
+         raise ClientError(
+             message=str(e),
+             error_code=getattr(e, "code", None),
+         ) from e


- def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], str, str]:
+ def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
      """
      Parsing uri and returns listing dataset name, listing uri and listing path
      """
      client_config = client_config or {}
-     client = Client.get_client(uri, cache, **client_config)
      storage_uri, path = Client.parse_url(uri)
-     telemetry.log_param("client", client.PREFIX)
-
-     if not uri.endswith("/") and _isfile(client, uri):
-         return None, f"{storage_uri}/{path.lstrip('/')}", path
      if uses_glob(path):
          lst_uri_path = posixpath.dirname(path)
      else:
@@ -157,13 +173,15 @@ def get_listing(
      client_config = catalog.client_config

      client = Client.get_client(uri, cache, **client_config)
-     ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-     listing = None
+     telemetry.log_param("client", client.PREFIX)

-     # if we don't want to use cached dataset (e.g. for a single file listing)
-     if not ds_name:
-         return None, list_uri, list_path, False
+     # we don't want to use cached dataset (e.g. for a single file listing)
+     if not uri.endswith("/") and _isfile(client, uri):
+         storage_uri, path = Client.parse_url(uri)
+         return None, f"{storage_uri}/{path.lstrip('/')}", path, False

+     ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
+     listing = None
      listings = [
          ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
      ]
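
Note: a hedged sketch of the single-file fast path now handled inside `get_listing` (assuming a surrounding `get_listing(uri, session)` call signature, which this diff does not show): a file URI returns no listing dataset name instead of materializing a listing.

```py
ds_name, list_uri, list_path, exists = get_listing(
    "s3://my-bucket/data/file.csv", session  # placeholder URI and session
)
assert ds_name is None  # single files skip the cached listing dataset
```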
datachain/lib/pytorch.py CHANGED
@@ -23,7 +23,7 @@ from datachain.query.dataset import get_download_callback
  if TYPE_CHECKING:
      from torchvision.transforms.v2 import Transform

-     from datachain.cache import DataChainCache as Cache
+     from datachain.cache import Cache


  logger = logging.getLogger("datachain")
datachain/lib/udf.py CHANGED
@@ -32,7 +32,7 @@ if TYPE_CHECKING:

      from typing_extensions import Self

-     from datachain.cache import DataChainCache as Cache
+     from datachain.cache import Cache
      from datachain.catalog import Catalog
      from datachain.lib.signal_schema import SignalSchema
      from datachain.lib.udf_signature import UdfSignature
datachain/listing.py CHANGED
@@ -2,7 +2,6 @@ import glob
  import os
  from collections.abc import Iterable, Iterator
  from functools import cached_property
- from itertools import zip_longest
  from typing import TYPE_CHECKING, Optional

  from sqlalchemy import Column
@@ -101,11 +100,8 @@ class Listing:
          copy_to_filename: Optional[str],
          recursive=False,
          copy_dir_contents=False,
-         relative_path=None,
-         from_edatachain=False,
          from_dataset=False,
      ) -> list[NodeWithPath]:
-         rel_path_elements = relative_path.split("/") if relative_path else []
          all_nodes: list[NodeWithPath] = []
          for src in sources:
              node = src.node
@@ -119,15 +115,7 @@ class Listing:
              )
          else:
              node_path = []
-             if from_edatachain:
-                 for rpe, npe in zip_longest(
-                     rel_path_elements, node.path.split("/")
-                 ):
-                     if rpe == npe:
-                         continue
-                     if npe:
-                         node_path.append(npe)
-             elif copy_to_filename:
+             if copy_to_filename:
                  node_path = [os.path.basename(copy_to_filename)]
              elif from_dataset:
                  node_path = [
datachain/node.py CHANGED
@@ -84,18 +84,6 @@ class Node:
          fd.write(f" size: {self.size}\n")
          return size

-     def get_metafile_data(self, path: str):
-         data: dict[str, Any] = {
-             "name": path,
-             "etag": self.etag,
-         }
-         version = self.version
-         if version:
-             data["version"] = version
-         data["last_modified"] = time_to_str(self.last_modified)
-         data["size"] = self.size
-         return data
-
      @property
      def full_path(self) -> str:
          if self.is_dir and self.path:
@@ -181,9 +169,6 @@ class NodeWithPath:
      def append_to_file(self, fd):
          return self.n.append_to_file(fd, "/".join(self.path))

-     def get_metafile_data(self):
-         return self.n.get_metafile_data("/".join(self.path))
-
      @property
      def full_path(self) -> str:
          path = "/".join(self.path)
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
  from datachain.nodes_thread_pool import NodesThreadPool

  if TYPE_CHECKING:
-     from datachain.cache import DataChainCache
+     from datachain.cache import Cache
      from datachain.client.fsspec import Client
      from datachain.node import Node

@@ -13,7 +13,7 @@ logger = logging.getLogger("datachain")


  class NodesFetcher(NodesThreadPool):
-     def __init__(self, client: "Client", max_threads: int, cache: "DataChainCache"):
+     def __init__(self, client: "Client", max_threads: int, cache: "Cache"):
          super().__init__(max_threads)
          self.client = client
          self.cache = cache