datachain 0.14.0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; see the release details for more information.

@@ -588,7 +588,7 @@ class Catalog:
588
588
 
589
589
  from_storage(
590
590
  source, session=self.session, update=update, object_name=object_name
591
- )
591
+ ).exec()
592
592
 
593
593
  list_ds_name, list_uri, list_path, _ = get_listing(
594
594
  source, self.session, update=update
@@ -89,9 +89,9 @@ class Client(ABC):
89
89
  from .local import FileClient
90
90
  from .s3 import ClientS3
91
91
 
92
- protocol = urlparse(str(url)).scheme
92
+ protocol = urlparse(os.fspath(url)).scheme
93
93
 
94
- if not protocol or _is_win_local_path(str(url)):
94
+ if not protocol or _is_win_local_path(os.fspath(url)):
95
95
  return FileClient
96
96
  if protocol == ClientS3.protocol:
97
97
  return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
122
122
  source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
123
  ) -> "Client":
124
124
  cls = Client.get_implementation(source)
125
- storage_url, _ = cls.split_url(str(source))
125
+ storage_url, _ = cls.split_url(os.fspath(source))
126
126
  if os.name == "nt":
127
127
  storage_url = storage_url.removeprefix("/")
128
128
 
datachain/lib/dc/json.py CHANGED
@@ -64,7 +64,7 @@ def from_json(
64
64
  from .storage import from_storage
65
65
 
66
66
  if schema_from == "auto":
67
- schema_from = str(path)
67
+ schema_from = os.fspath(path)
68
68
 
69
69
  def jmespath_to_name(s: str):
70
70
  name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
@@ -6,11 +6,15 @@ from typing import (
6
6
  )
7
7
 
8
8
  from datachain.lib.file import (
9
- File,
10
9
  FileType,
11
10
  get_file_type,
12
11
  )
13
- from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
12
+ from datachain.lib.listing import (
13
+ get_file_info,
14
+ get_listing,
15
+ list_bucket,
16
+ ls,
17
+ )
14
18
  from datachain.query import Session
15
19
 
16
20
  if TYPE_CHECKING:
@@ -18,7 +22,7 @@ if TYPE_CHECKING:
18
22
 
19
23
 
20
24
  def from_storage(
21
- uri: Union[str, os.PathLike[str]],
25
+ uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
22
26
  *,
23
27
  type: FileType = "binary",
24
28
  session: Optional[Session] = None,
@@ -30,11 +34,12 @@ def from_storage(
30
34
  anon: bool = False,
31
35
  client_config: Optional[dict] = None,
32
36
  ) -> "DataChain":
33
- """Get data from a storage as a list of file with all file attributes.
37
+ """Get data from storage(s) as a list of file with all file attributes.
34
38
  It returns the chain itself as usual.
35
39
 
36
40
  Parameters:
37
- uri : storage URI with directory. URI must start with storage prefix such
41
+ uri : storage URI with directory or list of URIs.
42
+ URIs must start with storage prefix such
38
43
  as `s3://`, `gs://`, `az://` or "file:///"
39
44
  type : read file as "binary", "text", or "image" data. Default is "binary".
40
45
  recursive : search recursively for the given path.
@@ -43,16 +48,26 @@ def from_storage(
43
48
  anon : If True, we will treat cloud bucket as public one
44
49
  client_config : Optional client configuration for the storage client.
45
50
 
46
- Example:
47
- Simple call from s3
48
- ```py
51
+ Returns:
52
+ DataChain: A DataChain object containing the file information.
53
+
54
+ Examples:
55
+ Simple call from s3:
56
+ ```python
49
57
  import datachain as dc
50
58
  chain = dc.from_storage("s3://my-bucket/my-dir")
51
59
  ```
52
60
 
53
- With AWS S3-compatible storage
54
- ```py
55
- import datachain as dc
61
+ Multiple URIs:
62
+ ```python
63
+ chain = dc.from_storage([
64
+ "s3://bucket1/dir1",
65
+ "s3://bucket2/dir2"
66
+ ])
67
+ ```
68
+
69
+ With AWS S3-compatible storage:
70
+ ```python
56
71
  chain = dc.from_storage(
57
72
  "s3://my-bucket/my-dir",
58
73
  client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
@@ -62,9 +77,15 @@ def from_storage(
62
77
  Pass existing session
63
78
  ```py
64
79
  session = Session.get()
65
- import datachain as dc
66
- chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
80
+ chain = dc.from_storage([
81
+ "path/to/dir1",
82
+ "path/to/dir2"
83
+ ], session=session, recursive=True)
67
84
  ```
85
+
86
+ Note:
87
+ When using multiple URIs with `update=True`, the function optimizes by
88
+ avoiding redundant updates for URIs pointing to the same storage location.
68
89
  """
69
90
  from .datachain import DataChain
70
91
  from .datasets import from_dataset
@@ -79,40 +100,71 @@ def from_storage(
79
100
  cache = session.catalog.cache
80
101
  client_config = session.catalog.client_config
81
102
 
82
- list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
83
- uri, session, update=update
84
- )
103
+ uris = uri if isinstance(uri, (list, tuple)) else [uri]
104
+
105
+ if not uris:
106
+ raise ValueError("No URIs provided")
107
+
108
+ storage_chain = None
109
+ listed_ds_name = set()
110
+ file_values = []
111
+
112
+ for single_uri in uris:
113
+ list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
114
+ single_uri, session, update=update
115
+ )
116
+
117
+ # list_ds_name is None if object is a file, we don't want to use cache
118
+ # or do listing in that case - just read that single object
119
+ if not list_ds_name:
120
+ file_values.append(
121
+ get_file_info(list_uri, cache, client_config=client_config)
122
+ )
123
+ continue
124
+
125
+ dc = from_dataset(list_ds_name, session=session, settings=settings)
126
+ dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
85
127
 
86
- # ds_name is None if object is a file, we don't want to use cache
87
- # or do listing in that case - just read that single object
88
- if not list_ds_name:
89
- dc = from_values(
128
+ if update or not list_ds_exists:
129
+
130
+ def lst_fn(ds_name, lst_uri):
131
+ # disable prefetch for listing, as it pre-downloads all files
132
+ (
133
+ from_records(
134
+ DataChain.DEFAULT_FILE_RECORD,
135
+ session=session,
136
+ settings=settings,
137
+ in_memory=in_memory,
138
+ )
139
+ .settings(prefetch=0)
140
+ .gen(
141
+ list_bucket(lst_uri, cache, client_config=client_config),
142
+ output={f"{object_name}": file_type},
143
+ )
144
+ .save(ds_name, listing=True)
145
+ )
146
+
147
+ dc._query.add_before_steps(
148
+ lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
149
+ )
150
+
151
+ chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
152
+
153
+ storage_chain = storage_chain.union(chain) if storage_chain else chain
154
+ listed_ds_name.add(list_ds_name)
155
+
156
+ if file_values:
157
+ file_chain = from_values(
90
158
  session=session,
91
159
  settings=settings,
92
160
  in_memory=in_memory,
93
- file=[get_file_info(list_uri, cache, client_config=client_config)],
161
+ file=file_values,
94
162
  )
95
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
96
- return dc
97
-
98
- if update or not list_ds_exists:
99
- # disable prefetch for listing, as it pre-downloads all files
100
- (
101
- from_records(
102
- DataChain.DEFAULT_FILE_RECORD,
103
- session=session,
104
- settings=settings,
105
- in_memory=in_memory,
106
- )
107
- .settings(prefetch=0)
108
- .gen(
109
- list_bucket(list_uri, cache, client_config=client_config),
110
- output={f"{object_name}": File},
111
- )
112
- .save(list_ds_name, listing=True)
163
+ file_chain.signals_schema = file_chain.signals_schema.mutate(
164
+ {f"{object_name}": file_type}
113
165
  )
166
+ storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
114
167
 
115
- dc = from_dataset(list_ds_name, session=session, settings=settings)
116
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
168
+ assert storage_chain is not None
117
169
 
118
- return ls(dc, list_path, recursive=recursive, object_name=object_name)
170
+ return storage_chain
@@ -47,6 +47,7 @@ from datachain.error import (
47
47
  QueryScriptCancelError,
48
48
  )
49
49
  from datachain.func.base import Function
50
+ from datachain.lib.listing import is_listing_dataset
50
51
  from datachain.lib.udf import UDFAdapter, _get_cache
51
52
  from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
52
53
  from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +152,6 @@ def step_result(
151
152
  )
152
153
 
153
154
 
154
- class StartingStep(ABC):
155
- """An initial query processing step, referencing a data source."""
156
-
157
- @abstractmethod
158
- def apply(self) -> "StepResult": ...
159
-
160
-
161
155
  @frozen
162
156
  class Step(ABC):
163
157
  """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +164,7 @@ class Step(ABC):
170
164
 
171
165
 
172
166
  @frozen
173
- class QueryStep(StartingStep):
167
+ class QueryStep:
174
168
  catalog: "Catalog"
175
169
  dataset_name: str
176
170
  dataset_version: int
@@ -1097,26 +1091,42 @@ class DatasetQuery:
1097
1091
  self.temp_table_names: list[str] = []
1098
1092
  self.dependencies: set[DatasetDependencyType] = set()
1099
1093
  self.table = self.get_table()
1100
- self.starting_step: StartingStep
1094
+ self.starting_step: Optional[QueryStep] = None
1101
1095
  self.name: Optional[str] = None
1102
1096
  self.version: Optional[int] = None
1103
1097
  self.feature_schema: Optional[dict] = None
1104
1098
  self.column_types: Optional[dict[str, Any]] = None
1099
+ self.before_steps: list[Callable] = []
1105
1100
 
1106
- self.name = name
1101
+ self.list_ds_name: Optional[str] = None
1107
1102
 
1108
- if fallback_to_studio and is_token_set():
1109
- ds = self.catalog.get_dataset_with_remote_fallback(name, version)
1103
+ self.name = name
1104
+ self.dialect = self.catalog.warehouse.db.dialect
1105
+ if version:
1106
+ self.version = version
1107
+
1108
+ if is_listing_dataset(name):
1109
+ # not setting query step yet as listing dataset might not exist at
1110
+ # this point
1111
+ self.list_ds_name = name
1112
+ elif fallback_to_studio and is_token_set():
1113
+ self._set_starting_step(
1114
+ self.catalog.get_dataset_with_remote_fallback(name, version)
1115
+ )
1110
1116
  else:
1111
- ds = self.catalog.get_dataset(name)
1117
+ self._set_starting_step(self.catalog.get_dataset(name))
1118
+
1119
+ def _set_starting_step(self, ds: "DatasetRecord") -> None:
1120
+ if not self.version:
1121
+ self.version = ds.latest_version
1112
1122
 
1113
- self.version = version or ds.latest_version
1123
+ self.starting_step = QueryStep(self.catalog, ds.name, self.version)
1124
+
1125
+ # at this point we know our starting dataset so setting up schemas
1114
1126
  self.feature_schema = ds.get_version(self.version).feature_schema
1115
1127
  self.column_types = copy(ds.schema)
1116
1128
  if "sys__id" in self.column_types:
1117
1129
  self.column_types.pop("sys__id")
1118
- self.starting_step = QueryStep(self.catalog, name, self.version)
1119
- self.dialect = self.catalog.warehouse.db.dialect
1120
1130
 
1121
1131
  def __iter__(self):
1122
1132
  return iter(self.db_results())
@@ -1180,11 +1190,23 @@ class DatasetQuery:
1180
1190
  col.table = self.table
1181
1191
  return col
1182
1192
 
1193
+ def add_before_steps(self, fn: Callable) -> None:
1194
+ """
1195
+ Setting custom function to be run before applying steps
1196
+ """
1197
+ self.before_steps.append(fn)
1198
+
1183
1199
  def apply_steps(self) -> QueryGenerator:
1184
1200
  """
1185
1201
  Apply the steps in the query and return the resulting
1186
1202
  sqlalchemy.SelectBase.
1187
1203
  """
1204
+ for fn in self.before_steps:
1205
+ fn()
1206
+
1207
+ if self.list_ds_name:
1208
+ # at this point we know what is our starting listing dataset name
1209
+ self._set_starting_step(self.catalog.get_dataset(self.list_ds_name)) # type: ignore [arg-type]
1188
1210
  query = self.clone()
1189
1211
 
1190
1212
  index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1225,7 @@ class DatasetQuery:
1203
1225
  query = query.filter(C.sys__rand % total == index)
1204
1226
  query.steps = query.steps[-1:] + query.steps[:-1]
1205
1227
 
1228
+ assert query.starting_step
1206
1229
  result = query.starting_step.apply()
1207
1230
  self.dependencies.update(result.dependencies)
1208
1231
 
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.14.0
3
+ Version: 0.14.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
- License: Apache-2.0
6
+ License-Expression: Apache-2.0
7
7
  Project-URL: Documentation, https://datachain.dvc.ai
8
8
  Project-URL: Issues, https://github.com/iterative/datachain/issues
9
9
  Project-URL: Source, https://github.com/iterative/datachain
@@ -17,7 +17,7 @@ datachain/studio.py,sha256=9MEpFPLKI3gG4isKklcfD5BMLeNsSXhtOUboOjW4Fdc,10017
17
17
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
18
18
  datachain/utils.py,sha256=CLAYkI7iPbLYw3Pjh5EkWuc2UOs8wEbuXQnqIs4UyV8,14173
19
19
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
20
- datachain/catalog/catalog.py,sha256=6dDTbSom8JzxLD_cbFboKtsiYtGR5WIOEOQTtCQ5mws,60722
20
+ datachain/catalog/catalog.py,sha256=FGW2cEOysgVMyokqIFAJ1PB-RYJrqDEFGfHP5qLYO-k,60729
21
21
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
22
22
  datachain/catalog/loader.py,sha256=AhSQR_-S-9lY3DcXn3PVZv9UtarHOMlDy2x75iDwUjo,6035
23
23
  datachain/cli/__init__.py,sha256=YPVkuQ7IezNhtzo5xrfca1hEIiZtFxOlJCOzAOEuxmA,8335
@@ -37,7 +37,7 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
37
37
  datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
38
38
  datachain/client/azure.py,sha256=ma6fJcnveG8wpNy1PSrN5hgvmRdCj8Sf3RKjfd3qCyM,3221
39
39
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
40
- datachain/client/fsspec.py,sha256=VutCpF8MDisDwdnJvJpiTuDU9BRRAa0Km3ZkD0sKaI0,13834
40
+ datachain/client/fsspec.py,sha256=UJ7PDq1F11gf7OMjfXYqzrS1GHL3FZctOwXI0S_LU74,13852
41
41
  datachain/client/gcs.py,sha256=tepsstv-6WkkJ16SVXIPKPlWdNyFlTqrUlDwulWlWGQ,5116
42
42
  datachain/client/hf.py,sha256=posnI5WOKOMG1yY_ZiV9Orcd24QsUPKZlOXgJVLxxrM,1558
43
43
  datachain/client/local.py,sha256=cGoCYflribzexiOe-Y1qbaE2fJRh-_EgQrfCSa0yK_E,4568
@@ -99,12 +99,12 @@ datachain/lib/dc/csv.py,sha256=OaVHYnOZiYEfsUcispXuGcIYQKF03u4XrRf6Fgce6Kk,4401
99
99
  datachain/lib/dc/datachain.py,sha256=NdGCRNk3NZCGQHs-sq0jiKkvsXiowiqDQTY_X4AbL6o,76390
100
100
  datachain/lib/dc/datasets.py,sha256=0vdgNpA_xakFgnfm78I1yU98u2hvOawOXS872pg2F48,4329
101
101
  datachain/lib/dc/hf.py,sha256=F_ME1IpUlQfhqVGe__Uz7jLwd-fp-O7pu50OLhkaG0w,2170
102
- datachain/lib/dc/json.py,sha256=gVH69oP8b5FR1YX3c_4Z_G1nFsAQ_xFz6fBg0J-U9ak,2719
102
+ datachain/lib/dc/json.py,sha256=mlrqsmxLDYNP7dmde3IDYP01QlbUzP8Pj5UDqlqJcZ0,2725
103
103
  datachain/lib/dc/listings.py,sha256=c2ASPhwRhPDMbA5esYp3kMVw6sQ7vsWEflHWh9x7tkw,1044
104
104
  datachain/lib/dc/pandas.py,sha256=eteVB6DqRGAU2tDF_Bep7JRU4nny3uyVPbGKOZ6PVq0,1249
105
105
  datachain/lib/dc/parquet.py,sha256=tO0rDL3XZ24rqkUJYAYn_yAyZgIYV5N6r28MTlPE0Z0,1809
106
106
  datachain/lib/dc/records.py,sha256=zV4vPJvCEd5mBv-E_q-VfrSXNjcfu74QY884z3QuftM,2524
107
- datachain/lib/dc/storage.py,sha256=PIz6K2VOtrVV7XUNd3BESp3P5WovgaG1RgBYut0OBNA,3789
107
+ datachain/lib/dc/storage.py,sha256=mIAlNEYRJ8r3yHA2sJyt8duwuSfehbPro7WqMQvezIc,5295
108
108
  datachain/lib/dc/utils.py,sha256=Ct-0FqCaDhNWHx09gJFcCXJGPjMI-VZr4t-GJyqTi44,3984
109
109
  datachain/lib/dc/values.py,sha256=PLBZew0BYO3mv7W3n8OF5Ad-5tp5eWPqlbiVxG5pJ30,1409
110
110
  datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -118,7 +118,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
118
118
  datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
119
119
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
120
120
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
121
- datachain/query/dataset.py,sha256=J3NgcrzSP2dFg8JVqDodyBh1QEia_B-alcyfI3xKlZE,57256
121
+ datachain/query/dataset.py,sha256=Em5vfKkZygzXCiWRYUBGLSh3eWlIamMBvh328YNnmww,58201
122
122
  datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
123
123
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
124
124
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -150,9 +150,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
150
150
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
151
151
  datachain/toolkit/split.py,sha256=VdcP_zVLqAxuSrze3BaR-dBzTmyKkCUAiAremw3OEPU,2914
152
152
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
153
- datachain-0.14.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
- datachain-0.14.0.dist-info/METADATA,sha256=lC1I5lSWJX7a9oNpsRnEOM_L1W3hfnY8Op7iGWaNNcM,11324
155
- datachain-0.14.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
- datachain-0.14.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
- datachain-0.14.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
- datachain-0.14.0.dist-info/RECORD,,
153
+ datachain-0.14.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
154
+ datachain-0.14.1.dist-info/METADATA,sha256=UPk0v7fsYz_eTsJf5YpexjD4jrjpWsKEyAVNSXN3KvE,11335
155
+ datachain-0.14.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
156
+ datachain-0.14.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
157
+ datachain-0.14.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
158
+ datachain-0.14.1.dist-info/RECORD,,