datachain 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl


@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_parquet(
+def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
@@ -43,18 +43,18 @@ def from_parquet(
         Reading a single file:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/file.parquet")
+        dc.read_parquet("s3://mybucket/file.parquet")
         ```
 
         Reading a partitioned dataset from a directory:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/dir")
+        dc.read_parquet("s3://mybucket/dir")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
         object_name=object_name,
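
For API consumers this first change is a pure rename: `dc.from_parquet` becomes `dc.read_parquet` with the same parameters, as the updated docstring examples show. A minimal migration sketch (the bucket path is illustrative):

```python
import datachain as dc

# 0.14.0:
# chain = dc.from_parquet("s3://mybucket/file.parquet", partitioning="hive")

# 0.14.2, same arguments, new name:
chain = dc.read_parquet("s3://mybucket/file.parquet", partitioning="hive")
```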
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_records(
+def read_records(
     to_insert: Optional[Union[dict, list[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -40,10 +40,10 @@ def from_records(
     Example:
         ```py
         import datachain as dc
-        single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+        single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
     """
-    from .datasets import from_dataset
+    from .datasets import read_dataset
 
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
@@ -87,4 +87,4 @@ def from_records(
     insert_q = dr.get_table().insert()
     for record in to_insert:
         db.execute(insert_q.values(**record))
-    return from_dataset(name=dsr.name, session=session, settings=settings)
+    return read_dataset(name=dsr.name, session=session, settings=settings)
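
`from_records` follows the same pattern. A short sketch mirroring the docstring example above:

```python
import datachain as dc

# Build a chain from explicit record dicts (was dc.from_records in 0.14.0)
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
```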
@@ -6,19 +6,23 @@ from typing import (
 )
 
 from datachain.lib.file import (
-    File,
     FileType,
     get_file_type,
 )
-from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
+from datachain.lib.listing import (
+    get_file_info,
+    get_listing,
+    list_bucket,
+    ls,
+)
 from datachain.query import Session
 
 if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_storage(
-    uri: Union[str, os.PathLike[str]],
+def read_storage(
+    uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     *,
     type: FileType = "binary",
     session: Optional[Session] = None,
@@ -30,11 +34,12 @@ def from_storage(
     anon: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
-    """Get data from a storage as a list of file with all file attributes.
+    """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.
 
     Parameters:
-        uri : storage URI with directory. URI must start with storage prefix such
+        uri : storage URI with directory or list of URIs.
+            URIs must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///"
         type : read file as "binary", "text", or "image" data. Default is "binary".
         recursive : search recursively for the given path.
@@ -43,17 +48,27 @@ def from_storage(
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
 
-    Example:
-        Simple call from s3
-        ```py
+    Returns:
+        DataChain: A DataChain object containing the file information.
+
+    Examples:
+        Simple call from s3:
+        ```python
         import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir")
+        chain = dc.read_storage("s3://my-bucket/my-dir")
         ```
 
-        With AWS S3-compatible storage
-        ```py
-        import datachain as dc
-        chain = dc.from_storage(
+        Multiple URIs:
+        ```python
+        chain = dc.read_storage([
+            "s3://bucket1/dir1",
+            "s3://bucket2/dir2"
+        ])
+        ```
+
+        With AWS S3-compatible storage:
+        ```python
+        chain = dc.read_storage(
             "s3://my-bucket/my-dir",
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
@@ -62,14 +77,20 @@ def from_storage(
         Pass existing session
         ```py
         session = Session.get()
-        import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
+        chain = dc.read_storage([
+            "path/to/dir1",
+            "path/to/dir2"
+        ], session=session, recursive=True)
         ```
+
+    Note:
+        When using multiple URIs with `update=True`, the function optimizes by
+        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
-    from .datasets import from_dataset
-    from .records import from_records
-    from .values import from_values
+    from .datasets import read_dataset
+    from .records import read_records
+    from .values import read_values
 
     file_type = get_file_type(type)
 
@@ -79,40 +100,72 @@ def from_storage(
     cache = session.catalog.cache
     client_config = session.catalog.client_config
 
-    list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-        uri, session, update=update
-    )
+    uris = uri if isinstance(uri, (list, tuple)) else [uri]
+
+    if not uris:
+        raise ValueError("No URIs provided")
+
+    storage_chain = None
+    listed_ds_name = set()
+    file_values = []
+
+    for single_uri in uris:
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+            single_uri, session, update=update
+        )
+
+        # list_ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            file_values.append(
+                get_file_info(list_uri, cache, client_config=client_config)
+            )
+            continue
+
+        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc._query.update = update
+        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
-    # ds_name is None if object is a file, we don't want to use cache
-    # or do listing in that case - just read that single object
-    if not list_ds_name:
-        dc = from_values(
+        if update or not list_ds_exists:
+
+            def lst_fn(ds_name, lst_uri):
+                # disable prefetch for listing, as it pre-downloads all files
+                (
+                    read_records(
+                        DataChain.DEFAULT_FILE_RECORD,
+                        session=session,
+                        settings=settings,
+                        in_memory=in_memory,
+                    )
+                    .settings(prefetch=0)
+                    .gen(
+                        list_bucket(lst_uri, cache, client_config=client_config),
+                        output={f"{object_name}": file_type},
+                    )
+                    .save(ds_name, listing=True)
+                )
+
+            dc._query.set_listing_fn(
+                lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
+            )
+
+        chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+
+        storage_chain = storage_chain.union(chain) if storage_chain else chain
+        listed_ds_name.add(list_ds_name)
+
+    if file_values:
+        file_chain = read_values(
             session=session,
             settings=settings,
             in_memory=in_memory,
-            file=[get_file_info(list_uri, cache, client_config=client_config)],
+            file=file_values,
         )
-        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-        return dc
-
-    if update or not list_ds_exists:
-        # disable prefetch for listing, as it pre-downloads all files
-        (
-            from_records(
-                DataChain.DEFAULT_FILE_RECORD,
-                session=session,
-                settings=settings,
-                in_memory=in_memory,
-            )
-            .settings(prefetch=0)
-            .gen(
-                list_bucket(list_uri, cache, client_config=client_config),
-                output={f"{object_name}": File},
-            )
-            .save(list_ds_name, listing=True)
+        file_chain.signals_schema = file_chain.signals_schema.mutate(
+            {f"{object_name}": file_type}
         )
+        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
 
-    dc = from_dataset(list_ds_name, session=session, settings=settings)
-    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+    assert storage_chain is not None
 
-    return ls(dc, list_path, recursive=recursive, object_name=object_name)
+    return storage_chain
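
Beyond the rename, `read_storage` now accepts either a single URI or a list of URIs: each listed location becomes its own listing chain, plain file URIs are collected into a `read_values` chain, and everything is combined with `union`. A usage sketch based on the new docstring examples (bucket names are illustrative):

```python
import datachain as dc

# Single URI, unchanged behavior (was dc.from_storage in 0.14.0)
images = dc.read_storage("s3://my-bucket/my-dir", type="image")

# New: several storage locations merged into one chain; listings are created
# lazily and reused unless update=True forces a refresh.
combined = dc.read_storage(
    [
        "s3://bucket1/dir1",
        "s3://bucket2/dir2",
    ],
    recursive=True,
)
```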
@@ -6,7 +6,7 @@ from typing import (
 )
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
-from datachain.lib.dc.records import from_records
+from datachain.lib.dc.records import read_records
 from datachain.lib.dc.utils import OutputType
 from datachain.query import Session
 
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_values(
+def read_values(
     ds_name: str = "",
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -32,7 +32,7 @@ def from_values(
     Example:
         ```py
         import datachain as dc
-        dc.from_values(fib=[1, 2, 3, 5, 8])
+        dc.read_values(fib=[1, 2, 3, 5, 8])
        ```
     """
     from .datachain import DataChain
@@ -42,7 +42,7 @@ def from_values(
     def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
         yield from tuples
 
-    chain = from_records(
+    chain = read_records(
         DataChain.DEFAULT_FILE_RECORD,
         session=session,
         settings=settings,
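
`from_values` gets the same treatment; per the docstring above:

```python
import datachain as dc

# Build a chain from in-memory values (was dc.from_values in 0.14.0)
fib_chain = dc.read_values(fib=[1, 2, 3, 5, 8])
```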
datachain/lib/listing.py CHANGED
@@ -4,6 +4,7 @@ import os
 import posixpath
 from collections.abc import Iterator
 from contextlib import contextmanager
+from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 from fsspec.asyn import get_loop
@@ -32,6 +33,16 @@ logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
 logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
 
 
+def listing_dataset_expired(lst_ds) -> bool:
+    """Function that checks if listing dataset is expired or not"""
+    lst_version = lst_ds.versions[-1]
+    if not lst_version.finished_at:
+        return False
+
+    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
+    return datetime.now(timezone.utc) > expires
+
+
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
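
The new `listing_dataset_expired` helper treats a listing as stale once its last version's `finished_at` timestamp plus the listing TTL is in the past, and as fresh when `finished_at` is unset. A standalone sketch of the same check with stand-in objects; the concrete `LISTING_TTL` value and the dataset/version shapes here are assumptions for illustration:

```python
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

LISTING_TTL = 4 * 60 * 60  # assumed TTL in seconds; the real constant lives in datachain.lib.listing


@dataclass
class FakeVersion:
    finished_at: Optional[datetime]


@dataclass
class FakeListingDataset:
    versions: list


def listing_dataset_expired(lst_ds) -> bool:
    """Mirrors the helper above: expired once finished_at + TTL has passed."""
    lst_version = lst_ds.versions[-1]
    if not lst_version.finished_at:
        return False
    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
    return datetime.now(timezone.utc) > expires


stale = FakeListingDataset([FakeVersion(datetime.now(timezone.utc) - timedelta(days=1))])
fresh = FakeListingDataset([FakeVersion(datetime.now(timezone.utc))])
print(listing_dataset_expired(stale), listing_dataset_expired(fresh))  # True False
```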
@@ -103,10 +103,10 @@ def read_meta( # noqa: C901
     model_name=None,
     nrows=None,
 ) -> Callable:
-    from datachain import from_storage
+    from datachain import read_storage
 
     if schema_from:
-        file = next(from_storage(schema_from, type="text").limit(1).collect("file"))
+        file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )
datachain/lib/pytorch.py CHANGED
@@ -14,7 +14,7 @@ from torchvision.transforms import v2
 from datachain import Session
 from datachain.cache import get_temp_cache
 from datachain.catalog import Catalog, get_catalog
-from datachain.lib.dc.datasets import from_dataset
+from datachain.lib.dc.datasets import read_dataset
 from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 from datachain.progress import CombinedDownloadCallback
@@ -122,7 +122,7 @@ class PytorchDataset(IterableDataset):
     ) -> Generator[tuple[Any, ...], None, None]:
         catalog = self._get_catalog()
         session = Session("PyTorch", catalog=catalog)
-        ds = from_dataset(
+        ds = read_dataset(
             name=self.name, version=self.version, session=session
         ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
datachain/lib/udf.py CHANGED
@@ -145,7 +145,7 @@ class UDFBase(AbstractUDF):
             return emb[0].tolist()
 
         (
-            dc.from_storage(
+            dc.read_storage(
                 "gs://datachain-demo/fashion-product-images/images", type="image"
             )
             .limit(5)
@@ -47,6 +47,10 @@ from datachain.error import (
     QueryScriptCancelError,
 )
 from datachain.func.base import Function
+from datachain.lib.listing import (
+    is_listing_dataset,
+    listing_dataset_expired,
+)
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +155,6 @@ def step_result(
     )
 
 
-class StartingStep(ABC):
-    """An initial query processing step, referencing a data source."""
-
-    @abstractmethod
-    def apply(self) -> "StepResult": ...
-
-
 @frozen
 class Step(ABC):
     """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +167,7 @@ class Step(ABC):
 
 
 @frozen
-class QueryStep(StartingStep):
+class QueryStep:
     catalog: "Catalog"
     dataset_name: str
     dataset_version: int
@@ -1086,6 +1083,7 @@ class DatasetQuery:
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
         fallback_to_studio: bool = True,
+        update: bool = False,
     ) -> None:
         from datachain.remote.studio import is_token_set
 
@@ -1097,26 +1095,44 @@ class DatasetQuery:
         self.temp_table_names: list[str] = []
         self.dependencies: set[DatasetDependencyType] = set()
         self.table = self.get_table()
-        self.starting_step: StartingStep
+        self.starting_step: Optional[QueryStep] = None
         self.name: Optional[str] = None
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
+        self.before_steps: list[Callable] = []
+        self.listing_fn: Optional[Callable] = None
+        self.update = update
 
-        self.name = name
+        self.list_ds_name: Optional[str] = None
 
-        if fallback_to_studio and is_token_set():
-            ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+        self.name = name
+        self.dialect = self.catalog.warehouse.db.dialect
+        if version:
+            self.version = version
+
+        if is_listing_dataset(name):
+            # not setting query step yet as listing dataset might not exist at
+            # this point
+            self.list_ds_name = name
+        elif fallback_to_studio and is_token_set():
+            self._set_starting_step(
+                self.catalog.get_dataset_with_remote_fallback(name, version)
+            )
         else:
-            ds = self.catalog.get_dataset(name)
+            self._set_starting_step(self.catalog.get_dataset(name))
+
+    def _set_starting_step(self, ds: "DatasetRecord") -> None:
+        if not self.version:
+            self.version = ds.latest_version
+
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)
 
-        self.version = version or ds.latest_version
+        # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.starting_step = QueryStep(self.catalog, name, self.version)
-        self.dialect = self.catalog.warehouse.db.dialect
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1180,11 +1196,30 @@ class DatasetQuery:
             col.table = self.table
         return col
 
+    def set_listing_fn(self, fn: Callable) -> None:
+        """Setting listing function to be run if needed"""
+        self.listing_fn = fn
+
     def apply_steps(self) -> QueryGenerator:
         """
         Apply the steps in the query and return the resulting
         sqlalchemy.SelectBase.
         """
+        if self.list_ds_name and not self.starting_step:
+            listing_ds = None
+            try:
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+            except DatasetNotFoundError:
+                pass
+
+            if not listing_ds or self.update or listing_dataset_expired(listing_ds):
+                assert self.listing_fn
+                self.listing_fn()
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+
+            # at this point we know what is our starting listing dataset name
+            self._set_starting_step(listing_ds)  # type: ignore [arg-type]
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1238,7 @@ class DatasetQuery:
             query = query.filter(C.sys__rand % total == index)
             query.steps = query.steps[-1:] + query.steps[:-1]
 
+        assert query.starting_step
         result = query.starting_step.apply()
         self.dependencies.update(result.dependencies)
 
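Taken together, these query changes make listing lazy: `DatasetQuery` records the listing dataset name and a deferred `listing_fn`, and `apply_steps()` only (re)runs the listing when the dataset is missing, `update` is set, or the TTL has lapsed. A condensed sketch of that decision using the names from the hunks above (the callables are stand-ins, not the real signatures):

```python
from typing import Callable, Optional


def resolve_listing_dataset(
    get_dataset: Callable[[str], Optional[object]],  # returns None when the listing dataset is missing
    list_ds_name: str,
    listing_fn: Callable[[], None],                  # the deferred listing job set via set_listing_fn()
    update: bool,
    expired: Callable[[object], bool],               # e.g. listing_dataset_expired
):
    """Re-list only when needed: missing dataset, explicit update, or expired TTL."""
    listing_ds = get_dataset(list_ds_name)
    if listing_ds is None or update or expired(listing_ds):
        listing_fn()
        listing_ds = get_dataset(list_ds_name)
    return listing_ds  # becomes the query's starting step
```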
@@ -41,7 +41,7 @@ def train_test_split(
     from datachain.toolkit import train_test_split
 
     # Load a DataChain from a storage source (e.g., S3 bucket)
-    dc = dc.from_storage("s3://bucket/dir/")
+    dc = dc.read_storage("s3://bucket/dir/")
 
     # Perform a 70/30 train-test split
     train, test = train_test_split(dc, [0.7, 0.3])
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.0
+Version: 0.14.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
+License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
 Project-URL: Issues, https://github.com/iterative/datachain/issues
 Project-URL: Source, https://github.com/iterative/datachain
@@ -38,7 +38,7 @@ Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
 Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<3,>=2
+Requires-Dist: pydantic<2.11,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
@@ -171,8 +171,8 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
-images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
 annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")
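
The README and docstring updates in the package metadata all reflect the same public-API rename. A summary of the mapping that actually appears in this diff, written as a plain lookup table:

```python
# 0.14.0 name -> 0.14.2 name, as seen in the hunks above
RENAMES = {
    "from_storage": "read_storage",
    "from_parquet": "read_parquet",
    "from_records": "read_records",
    "from_values": "read_values",
    "from_dataset": "read_dataset",
    "from_json": "read_json",
}
```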