datachain 0.20.4__py3-none-any.whl → 0.21.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +11 -2
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -405
  12. datachain/data_storage/sqlite.py +7 -136
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -260
  21. datachain/lib/dc/datasets.py +50 -104
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +20 -27
  38. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/METADATA +2 -2
  39. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.4.dist-info → datachain-0.21.1.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,11 @@
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

- from datachain.dataset import parse_dataset_name
- from datachain.error import (
- DatasetNotFoundError,
- DatasetVersionNotFoundError,
- ProjectNotFoundError,
- )
+ from datachain.error import DatasetVersionNotFoundError
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
  File,
  )
- from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
@@ -30,18 +24,12 @@ if TYPE_CHECKING:

  def read_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[Union[str, int]] = None,
  session: Optional[Session] = None,
  settings: Optional[dict] = None,
  fallback_to_studio: bool = True,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -50,36 +38,47 @@ def read_dataset(
  If dataset or version is not found locally, it will try to pull it from Studio.

  Parameters:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to read is created
- project : optional name of project in which dataset to read is created
+ name : dataset name
  version : dataset version
  session : Session to use for the chain.
  settings : Settings to use for the chain.
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
  Default is True.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in delta_on will be used.
+ delta_retry: Specifies retry behavior for delta processing. If a string,
+ it's the name of a field in the result dataset that indicates an error
+ when not None - records with errors will be reprocessed. If True,
+ records that exist in the source dataset but not in the result dataset
+ will be reprocessed.

  Example:
  ```py
@@ -87,11 +86,6 @@ def read_dataset(
  chain = dc.read_dataset("my_cats")
  ```

- ```py
- import datachain as dc
- chain = dc.read_dataset("dev.animals.my_cats")
- ```
-
  ```py
  chain = dc.read_dataset("my_cats", fallback_to_studio=False)
  ```
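To illustrate the reworked `delta_*` parameters documented above, here is a minimal usage sketch. The dataset name and field names are placeholders, and note that in 0.21.1 `delta_on` defaults to `None` rather than `("file.path", "file.etag", "file.version")`.

```py
import datachain as dc

# Illustrative sketch only; "my_cats" and the field names below are placeholders.
# delta=True builds the next dataset version from the diff against the previous
# version instead of reprocessing every record.
chain = dc.read_dataset(
    "my_cats",
    delta=True,
    delta_on=["file.path", "file.etag"],  # fields that identify a source row
    delta_compare=["file.size"],          # fields that signal a modification
    delta_retry="error",                  # reprocess rows whose `error` field is set
)
```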
@@ -122,15 +116,6 @@ def read_dataset(

  from .datachain import DataChain

- session = Session.get(session)
- catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
  if version is not None:
  try:
  # for backward compatibility we still allow users to put version as integer
@@ -140,15 +125,7 @@
  # all 2.* dataset versions). If dataset doesn't have any versions where
  # major part is equal to that input, exception is thrown.
  major = int(version)
- try:
- ds_project = get_project(project_name, namespace_name, session=session)
- except ProjectNotFoundError:
- raise DatasetNotFoundError(
- f"Dataset {name} not found in namespace {namespace_name} and",
- f" project {project_name}",
- ) from None
-
- dataset = session.catalog.get_dataset(name, ds_project)
+ dataset = Session.get(session).catalog.get_dataset(name)
  latest_major = dataset.latest_major_version(major)
  if not latest_major:
  raise DatasetVersionNotFoundError(
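As the comment in this hunk notes, an integer `version` is still accepted for backward compatibility and resolves to the latest version with that major part; a short hedged sketch (the dataset name is a placeholder):

```py
import datachain as dc

# version=2 resolves to the newest 2.x.x version of the dataset;
# DatasetVersionNotFoundError is raised if no 2.* version exists.
chain = dc.read_dataset("my_cats", version=2)

# Semver strings continue to work as before.
chain = dc.read_dataset("my_cats", version="2.1.0")
```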
@@ -159,22 +136,19 @@
  # version is in new semver string format, continuing as normal
  pass

- if settings:
- _settings = Settings(**settings)
- else:
- _settings = Settings()
-
  query = DatasetQuery(
  name=name,
- project_name=project_name,
- namespace_name=namespace_name,
  version=version, # type: ignore[arg-type]
  session=session,
  indexing_column_types=File._datachain_column_types,
  fallback_to_studio=fallback_to_studio,
  )
-
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+ if settings:
+ _settings = Settings(**settings)
+ else:
+ _settings = Settings()
+
  signals_schema = SignalSchema({"sys": Sys})
  if query.feature_schema:
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -226,7 +200,7 @@ def datasets(
  import datachain as dc

  chain = dc.datasets(column="dataset")
- for ds in chain.to_iter("dataset"):
+ for ds in chain.collect("dataset"):
  print(f"{ds.name}@v{ds.version}")
  ```
  """
@@ -277,8 +251,6 @@ def datasets(

  def delete_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[str] = None,
  force: Optional[bool] = False,
  studio: Optional[bool] = False,
@@ -289,16 +261,11 @@ def delete_dataset(
  a force flag.

  Args:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to delete is created
- project : optional name of project in which dataset to delete is created
+ name : Dataset name
  version : Optional dataset version
  force: If true, all datasets versions will be removed. Defaults to False.
- studio: If True, removes dataset from Studio only, otherwise removes local
- dataset. Defaults to False.
+ studio: If True, removes dataset from Studio only,
+ otherwise remove from local. Defaults to False.
  session: Optional session instance. If not provided, uses default session.
  in_memory: If True, creates an in-memory session. Defaults to False.

@@ -315,32 +282,11 @@
  dc.delete_dataset("cats", version="1.0.0")
  ```
  """
- from datachain.studio import remove_studio_dataset

  session = Session.get(session, in_memory=in_memory)
  catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
- if not catalog.metastore.is_local_dataset(namespace_name) and studio:
- return remove_studio_dataset(
- None, name, namespace_name, project_name, version=version, force=force
- )
-
- try:
- ds_project = get_project(project_name, namespace_name, session=session)
- except ProjectNotFoundError:
- raise DatasetNotFoundError(
- f"Dataset {name} not found in namespace {namespace_name} and project",
- f" {project_name}",
- ) from None
-
  if not force:
- version = version or catalog.get_dataset(name, ds_project).latest_version
+ version = version or catalog.get_dataset(name).latest_version
  else:
  version = None
- catalog.remove_dataset(name, ds_project, version=version, force=force)
+ catalog.remove_dataset(name, version=version, force=force, studio=studio)
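A brief sketch of the simplified `delete_dataset()` call after this change; the dataset name is a placeholder and the flags follow the docstring above:

```py
import datachain as dc

# Remove a single local version, as in the docstring example.
dc.delete_dataset("cats", version="1.0.0")

# force=True removes every version of the dataset.
dc.delete_dataset("cats", force=True)

# studio=True removes the dataset from Studio instead of locally.
dc.delete_dataset("cats", version="1.0.0", studio=True)
```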
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
  return sa.select(*columns)

  table_name = self.catalog.warehouse.dataset_table_name(
- self.dataset, self.dataset_version
+ self.dataset_name, self.dataset_version
  )
  dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
  table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
  )

  return step_result(
- q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+ q, table.columns, dependencies=[(self.dataset_name, self.dataset_version)]
  )


@@ -142,7 +142,7 @@ def read_listing_dataset(
  _settings = Settings(prefetch=0)
  signal_schema = SignalSchema({"sys": Sys, "file": File})

- query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+ query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
  query.version = version
  # We already know that this is a listing dataset,
  # so we can set the listing function to True
@@ -68,7 +68,6 @@ def read_records(

  dsr = catalog.create_dataset(
  name,
- catalog.metastore.default_project,
  columns=columns,
  feature_schema=(
  signal_schema.clone_without_sys_signals().serialize()
@@ -35,11 +35,7 @@ def read_storage(
  update: bool = False,
  anon: bool = False,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -58,25 +54,43 @@ def read_storage(
  update : force storage reindexing. Default is False.
  anon : If True, we will treat cloud bucket as public one
  client_config : Optional client configuration for the storage client.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in `delta_on` will be used.
+ delta_retry: Controls which records to reprocess. Can be:
+ - A string specifying a field name: Records where this field is not None
+ will be reprocessed (error checking mode).
+ - True: Records that exist in the source dataset but not in the result
+ dataset (based on delta_on/delta_result_on fields) will be reprocessed
+ (missing records mode).
+ - False or None: No retry processing.

  Returns:
  DataChain: A DataChain object containing the file information.
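For the same delta machinery on the storage side, a hedged sketch of `read_storage()` under the new signature; the bucket URI, dataset name, and field names are placeholders, and `delta_on` is passed explicitly since its default is now `None`:

```py
import datachain as dc

chain = dc.read_storage(
    "s3://my-bucket/images/",
    delta=True,
    delta_on=["file.path", "file.etag"],  # identify the same object across source versions
    delta_retry=True,                     # also reprocess rows missing from the result
)
# Only the diff against the previous "images_index" version is processed and merged.
chain.save("images_index")
```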
@@ -130,8 +144,6 @@ def read_storage(
  catalog = session.catalog
  cache = catalog.cache
  client_config = session.catalog.client_config
- listing_namespace_name = catalog.metastore.system_namespace_name
- listing_project_name = catalog.metastore.listing_project_name

  uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -155,13 +167,7 @@ def read_storage(
  )
  continue

- dc = read_dataset(
- list_ds_name,
- namespace=listing_namespace_name,
- project=listing_project_name,
- session=session,
- settings=settings,
- )
+ dc = read_dataset(list_ds_name, session=session, settings=settings)
  dc._query.update = update
  dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -176,11 +182,7 @@ def read_storage(
  settings=settings,
  in_memory=in_memory,
  )
- .settings(
- prefetch=0,
- namespace=listing_namespace_name,
- project=listing_project_name,
- )
+ .settings(prefetch=0)
  .gen(
  list_bucket(lst_uri, cache, client_config=client_config),
  output={f"{column}": file_type},
datachain/lib/file.py CHANGED
@@ -5,14 +5,13 @@ import json
  import logging
  import os
  import posixpath
- import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Iterator
  from contextlib import contextmanager
  from datetime import datetime
  from functools import partial
  from io import BytesIO
- from pathlib import Path, PurePath, PurePosixPath
+ from pathlib import Path, PurePosixPath
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
  from urllib.parse import unquote, urlparse
  from urllib.request import url2pathname
@@ -70,7 +69,7 @@ class FileExporter(NodesThreadPool):
  for task in done:
  task.result()

- def do_task(self, file: "File"):
+ def do_task(self, file):
  file.export(
  self.output,
  self.placement,
@@ -275,8 +274,8 @@ class File(DataModel):

  @field_validator("path", mode="before")
  @classmethod
- def validate_path(cls, path: str) -> str:
- return PurePath(path).as_posix() if path else ""
+ def validate_path(cls, path):
+ return Path(path).as_posix() if path else ""

  def model_dump_custom(self):
  res = self.model_dump()
@@ -338,11 +337,11 @@ class File(DataModel):
  return cls(**{key: row[key] for key in cls._datachain_column_types})

  @property
- def name(self) -> str:
+ def name(self):
  return PurePosixPath(self.path).name

  @property
- def parent(self) -> str:
+ def parent(self):
  return str(PurePosixPath(self.path).parent)

  @contextmanager
@@ -392,7 +391,7 @@ class File(DataModel):

  client.upload(self.read(), destination)

- def _symlink_to(self, destination: str) -> None:
+ def _symlink_to(self, destination: str):
  if self.location:
  raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -401,7 +400,7 @@ class File(DataModel):
  source = self.get_local_path()
  assert source, "File was not cached"
  elif self.source.startswith("file://"):
- source = self.get_fs_path()
+ source = self.get_path()
  else:
  raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -482,62 +481,27 @@ class File(DataModel):

  def get_file_ext(self):
  """Returns last part of file name without `.`."""
- return PurePosixPath(self.path).suffix.lstrip(".")
+ return PurePosixPath(self.path).suffix.strip(".")

  def get_file_stem(self):
  """Returns file name without extension."""
  return PurePosixPath(self.path).stem

  def get_full_name(self):
- """
- [DEPRECATED] Use `file.path` directly instead.
-
- Returns name with parent directories.
- """
- warnings.warn(
- "file.get_full_name() is deprecated and will be removed "
- "in a future version. Use `file.path` directly.",
- DeprecationWarning,
- stacklevel=2,
- )
+ """Returns name with parent directories."""
  return self.path

- def get_path_normalized(self) -> str:
- if not self.path:
- raise FileError("path must not be empty", self.source, self.path)
-
- if self.path.endswith("/"):
- raise FileError("path must not be a directory", self.source, self.path)
-
- normpath = os.path.normpath(self.path)
- normpath = PurePath(normpath).as_posix()
-
- if normpath == ".":
- raise FileError("path must not be a directory", self.source, self.path)
-
- if any(part == ".." for part in PurePath(normpath).parts):
- raise FileError("path must not contain '..'", self.source, self.path)
-
- return normpath
-
- def get_uri(self) -> str:
+ def get_uri(self):
  """Returns file URI."""
- return f"{self.source}/{self.get_path_normalized()}"
+ return f"{self.source}/{self.get_full_name()}"

- def get_fs_path(self) -> str:
- """
- Returns file path with respect to the filescheme.
-
- If `normalize` is True, the path is normalized to remove any redundant
- separators and up-level references.
-
- If the file scheme is "file", the path is converted to a local file path
- using `url2pathname`. Otherwise, the original path with scheme is returned.
- """
+ def get_path(self) -> str:
+ """Returns file path."""
  path = unquote(self.get_uri())
- path_parsed = urlparse(path)
- if path_parsed.scheme == "file":
- path = url2pathname(path_parsed.path)
+ source = urlparse(self.source)
+ if source.scheme == "file":
+ path = urlparse(path).path
+ path = url2pathname(path)
  return path

  def get_destination_path(
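The new `get_path()` body above reduces to URI unquoting plus a `url2pathname()` conversion for `file://` sources. A standalone sketch of that transformation with placeholder paths:

```py
from urllib.parse import unquote, urlparse
from urllib.request import url2pathname

source = "file:///tmp/data"             # stand-in for File.source
uri = unquote(f"{source}/cat%201.jpg")  # unquoted URI, as get_uri() plus unquote()

if urlparse(source).scheme == "file":
    # local sources are converted to an OS path, e.g. /tmp/data/cat 1.jpg on POSIX
    path = url2pathname(urlparse(uri).path)
else:
    path = uri                          # remote sources keep the scheme-qualified URI

print(path)
```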
@@ -552,7 +516,7 @@ class File(DataModel):
  elif placement == "etag":
  path = f"{self.etag}{self.get_file_suffix()}"
  elif placement == "fullpath":
- path = unquote(self.get_path_normalized())
+ path = unquote(self.get_full_name())
  source = urlparse(self.source)
  if source.scheme and source.scheme != "file":
  path = posixpath.join(source.netloc, path)
@@ -590,9 +554,8 @@ class File(DataModel):
  ) from e

  try:
- normalized_path = self.get_path_normalized()
- info = client.fs.info(client.get_full_path(normalized_path))
- converted_info = client.info_to_file(info, normalized_path)
+ info = client.fs.info(client.get_full_path(self.path))
+ converted_info = client.info_to_file(info, self.path)
  return type(self)(
  path=self.path,
  source=self.source,
@@ -603,17 +566,8 @@ class File(DataModel):
  last_modified=converted_info.last_modified,
  location=self.location,
  )
- except FileError as e:
- logger.warning(
- "File error when resolving %s/%s: %s", self.source, self.path, str(e)
- )
  except (FileNotFoundError, PermissionError, OSError) as e:
- logger.warning(
- "File system error when resolving %s/%s: %s",
- self.source,
- self.path,
- str(e),
- )
+ logger.warning("File system error when resolving %s: %s", self.path, str(e))

  return type(self)(
  path=self.path,
@@ -629,8 +583,6 @@

  def resolve(file: File) -> File:
  """
- [DEPRECATED] Use `file.resolve()` directly instead.
-
  Resolve a File object by checking its existence and updating its metadata.

  This function is a wrapper around the File.resolve() method, designed to be
@@ -646,12 +598,6 @@ def resolve(file: File) -> File:
  RuntimeError: If the file's catalog is not set or if
  the file source protocol is unsupported.
  """
- warnings.warn(
- "resolve() is deprecated and will be removed "
- "in a future version. Use file.resolve() directly.",
- DeprecationWarning,
- stacklevel=2,
- )
  return file.resolve()


@@ -999,7 +945,7 @@ class ArrowRow(DataModel):
  ds = dataset(path, **self.kwargs)

  else:
- path = self.file.get_fs_path()
+ path = self.file.get_path()
  ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

  return ds.take([self.index]).to_reader()
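Rounding out the file.py changes, a hedged sketch of how `File.resolve()` (and the module-level `resolve()` wrapper, now without its deprecation warning) is typically called; the source and path are placeholders, and a catalog must already be attached to the `File` for the lookup to succeed:

```py
from datachain.lib.file import File

# Placeholder object: resolve() re-reads size, etag, version, is_latest and
# last_modified from storage and returns a new File (fields fall back to
# defaults when the object cannot be reached). It raises RuntimeError if the
# file's catalog is not set, e.g. outside a DataChain session/UDF context.
f = File(source="s3://my-bucket", path="images/cat.jpg")
fresh = f.resolve()
```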
datachain/lib/listing.py CHANGED
@@ -123,9 +123,6 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
  f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
  )

- # we should remove dots from the name
- ds_name = ds_name.replace(".", "_")
-
  return ds_name, lst_uri, path


@@ -198,4 +195,5 @@ def get_listing(
  list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

  ds_name = listing.name if listing else ds_name
+
  return ds_name, list_uri, list_path, bool(listing)
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
  from datachain import read_storage

  if schema_from:
- file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
+ file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
  model_code = gen_datamodel_code(
  file, format=format, jmespath=jmespath, model_name=model_name
  )
datachain/lib/pytorch.py CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
  if self.num_samples > 0:
  ds = ds.sample(self.num_samples)
  ds = ds.chunk(total_rank, total_workers)
- yield from ds.to_iter()
+ yield from ds.collect()

  def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
  from datachain.lib.udf import _prefetch_inputs