datachain 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (47)
  1. datachain/__init__.py +0 -2
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +65 -180
  4. datachain/cli/__init__.py +7 -0
  5. datachain/cli/commands/datasets.py +28 -43
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +35 -1
  8. datachain/client/fsspec.py +3 -5
  9. datachain/client/hf.py +0 -10
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +37 -403
  12. datachain/data_storage/sqlite.py +7 -139
  13. datachain/data_storage/warehouse.py +7 -26
  14. datachain/dataset.py +12 -126
  15. datachain/delta.py +7 -11
  16. datachain/error.py +0 -36
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +0 -4
  20. datachain/lib/dc/datachain.py +92 -259
  21. datachain/lib/dc/datasets.py +49 -87
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +0 -1
  24. datachain/lib/dc/storage.py +40 -38
  25. datachain/lib/file.py +23 -77
  26. datachain/lib/listing.py +1 -3
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/pytorch.py +1 -1
  29. datachain/lib/settings.py +0 -10
  30. datachain/lib/tar.py +2 -1
  31. datachain/lib/udf_signature.py +1 -1
  32. datachain/lib/webdataset.py +20 -30
  33. datachain/listing.py +1 -3
  34. datachain/query/dataset.py +46 -71
  35. datachain/query/session.py +1 -1
  36. datachain/remote/studio.py +26 -61
  37. datachain/studio.py +7 -23
  38. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/METADATA +2 -2
  39. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/RECORD +43 -47
  40. datachain/lib/namespaces.py +0 -71
  41. datachain/lib/projects.py +0 -86
  42. datachain/namespace.py +0 -65
  43. datachain/project.py +0 -78
  44. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/WHEEL +0 -0
  45. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/licenses/LICENSE +0 -0
  47. {datachain-0.20.3.dist-info → datachain-0.21.0.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,11 @@
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

- from datachain.dataset import parse_dataset_name
  from datachain.error import DatasetVersionNotFoundError
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
  File,
  )
- from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
@@ -26,18 +24,12 @@ if TYPE_CHECKING:

  def read_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[Union[str, int]] = None,
  session: Optional[Session] = None,
  settings: Optional[dict] = None,
  fallback_to_studio: bool = True,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -46,36 +38,47 @@ def read_dataset(
  If dataset or version is not found locally, it will try to pull it from Studio.

  Parameters:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to read is created
- project : optional name of project in which dataset to read is created
+ name : dataset name
  version : dataset version
  session : Session to use for the chain.
  settings : Settings to use for the chain.
  fallback_to_studio : Try to pull dataset from Studio if not found locally.
  Default is True.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in delta_on will be used.
+ delta_retry: Specifies retry behavior for delta processing. If a string,
+ it's the name of a field in the result dataset that indicates an error
+ when not None - records with errors will be reprocessed. If True,
+ records that exist in the source dataset but not in the result dataset
+ will be reprocessed.

  Example:
  ```py
@@ -83,11 +86,6 @@ def read_dataset(
  chain = dc.read_dataset("my_cats")
  ```

- ```py
- import datachain as dc
- chain = dc.read_dataset("dev.animals.my_cats")
- ```
-
  ```py
  chain = dc.read_dataset("my_cats", fallback_to_studio=False)
  ```
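
For orientation, a minimal usage sketch of the `read_dataset` signature shown on the `+` side of these hunks; the dataset name and the `delta_on` fields are illustrative, and the call assumes the dataset already exists locally or in Studio:

```py
import datachain as dc

# Hypothetical dataset; delta settings mirror the parameters documented above.
chain = dc.read_dataset(
    "my_cats",
    version="1.0.0",           # semver string (an integer major version is still accepted)
    fallback_to_studio=False,  # don't pull from Studio if the dataset is missing locally
    delta=True,                # only process rows added or changed since the previous version
    delta_on=["file.path", "file.etag"],  # illustrative identity fields
)
```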
@@ -118,15 +116,6 @@ def read_dataset(

  from .datachain import DataChain

- session = Session.get(session)
- catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
  if version is not None:
  try:
  # for backward compatibility we still allow users to put version as integer
@@ -136,9 +125,7 @@ def read_dataset(
  # all 2.* dataset versions). If dataset doesn't have any versions where
  # major part is equal to that input, exception is thrown.
  major = int(version)
- dataset = session.catalog.get_dataset(
- name, get_project(project_name, namespace_name, session=session)
- )
+ dataset = Session.get(session).catalog.get_dataset(name)
  latest_major = dataset.latest_major_version(major)
  if not latest_major:
  raise DatasetVersionNotFoundError(
@@ -149,22 +136,19 @@ def read_dataset(
  # version is in new semver string format, continuing as normal
  pass

- if settings:
- _settings = Settings(**settings)
- else:
- _settings = Settings()
-
  query = DatasetQuery(
  name=name,
- project_name=project_name,
- namespace_name=namespace_name,
  version=version, # type: ignore[arg-type]
  session=session,
  indexing_column_types=File._datachain_column_types,
  fallback_to_studio=fallback_to_studio,
  )
-
  telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+ if settings:
+ _settings = Settings(**settings)
+ else:
+ _settings = Settings()
+
  signals_schema = SignalSchema({"sys": Sys})
  if query.feature_schema:
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -216,7 +200,7 @@ def datasets(
  import datachain as dc

  chain = dc.datasets(column="dataset")
- for ds in chain.to_iter("dataset"):
+ for ds in chain.collect("dataset"):
  print(f"{ds.name}@v{ds.version}")
  ```
  """
@@ -267,8 +251,6 @@ def datasets(

  def delete_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
  version: Optional[str] = None,
  force: Optional[bool] = False,
  studio: Optional[bool] = False,
@@ -279,16 +261,11 @@ def delete_dataset(
  a force flag.

  Args:
- name: The dataset name, which can be a fully qualified name including the
- namespace and project. Alternatively, it can be a regular name, in which
- case the explicitly defined namespace and project will be used if they are
- set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to delete is created
- project : optional name of project in which dataset to delete is created
+ name : Dataset name
  version : Optional dataset version
  force: If true, all datasets versions will be removed. Defaults to False.
- studio: If True, removes dataset from Studio only, otherwise removes local
- dataset. Defaults to False.
+ studio: If True, removes dataset from Studio only,
+ otherwise remove from local. Defaults to False.
  session: Optional session instance. If not provided, uses default session.
  in_memory: If True, creates an in-memory session. Defaults to False.

@@ -305,26 +282,11 @@ def delete_dataset(
  dc.delete_dataset("cats", version="1.0.0")
  ```
  """
- from datachain.studio import remove_studio_dataset

  session = Session.get(session, in_memory=in_memory)
  catalog = session.catalog
-
- namespace_name, project_name, name = parse_dataset_name(name)
- namespace_name = (
- namespace_name or namespace or catalog.metastore.default_namespace_name
- )
- project_name = project_name or project or catalog.metastore.default_project_name
-
- if not catalog.metastore.is_local_dataset(namespace_name) and studio:
- return remove_studio_dataset(
- None, name, namespace_name, project_name, version=version, force=force
- )
-
- ds_project = get_project(project_name, namespace_name, session=session)
-
  if not force:
- version = version or catalog.get_dataset(name, ds_project).latest_version
+ version = version or catalog.get_dataset(name).latest_version
  else:
  version = None
- catalog.remove_dataset(name, ds_project, version=version, force=force)
+ catalog.remove_dataset(name, version=version, force=force, studio=studio)
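
A correspondingly minimal sketch of `delete_dataset` as it appears after this hunk; the dataset name is illustrative and assumed to exist:

```py
import datachain as dc

dc.delete_dataset("cats", version="1.0.0")  # remove a single version of a local dataset
dc.delete_dataset("cats", force=True)       # remove every version
dc.delete_dataset("cats", studio=True)      # remove the dataset from Studio instead of locally
```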
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
  return sa.select(*columns)

  table_name = self.catalog.warehouse.dataset_table_name(
- self.dataset, self.dataset_version
+ self.dataset_name, self.dataset_version
  )
  dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
  table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
  )

  return step_result(
- q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+ q, table.columns, dependencies=[(self.dataset_name, self.dataset_version)]
  )


@@ -142,7 +142,7 @@ def read_listing_dataset(
  _settings = Settings(prefetch=0)
  signal_schema = SignalSchema({"sys": Sys, "file": File})

- query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+ query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
  query.version = version
  # We already know that this is a listing dataset,
  # so we can set the listing function to True
@@ -68,7 +68,6 @@ def read_records(

  dsr = catalog.create_dataset(
  name,
- catalog.metastore.default_project,
  columns=columns,
  feature_schema=(
  signal_schema.clone_without_sys_signals().serialize()
@@ -35,11 +35,7 @@ def read_storage(
  update: bool = False,
  anon: bool = False,
  delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
- "file.path",
- "file.etag",
- "file.version",
- ),
+ delta_on: Optional[Union[str, Sequence[str]]] = None,
  delta_result_on: Optional[Union[str, Sequence[str]]] = None,
  delta_compare: Optional[Union[str, Sequence[str]]] = None,
  delta_retry: Optional[Union[bool, str]] = None,
@@ -58,25 +54,43 @@ def read_storage(
  update : force storage reindexing. Default is False.
  anon : If True, we will treat cloud bucket as public one
  client_config : Optional client configuration for the storage client.
- delta: If True, only process new or changed files instead of reprocessing
- everything. This saves time by skipping files that were already processed in
- previous versions. The optimization is working when a new version of the
- dataset is created.
- Default is False.
- delta_on: Field(s) that uniquely identify each record in the source data.
- Used to detect which records are new or changed.
- Default is ("file.path", "file.etag", "file.version").
- delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
- Only needed if you rename the identifying fields during processing.
- Default is None.
- delta_compare: Field(s) used to detect if a record has changed.
- If not specified, all fields except `delta_on` fields are used.
- Default is None.
- delta_retry: Controls retry behavior for failed records:
- - String (field name): Reprocess records where this field is not empty
- (error mode)
- - True: Reprocess records missing from the result dataset (missing mode)
- - None: No retry processing (default)
+ delta: If set to True, we optimize the creation of new dataset versions by
+ calculating the diff between the latest version of this storage and the
+ version used to create the most recent version of the resulting chain
+ dataset (the one specified in `.save()`). We then run the "diff" chain
+ using only the diff data, rather than the entire storage data, and merge
+ that diff chain with the latest version of the resulting dataset to create
+ a new version. This approach avoids applying modifications to all records
+ from storage every time, which can be an expensive operation.
+ The diff is calculated using the `DataChain.compare()` method, which
+ compares the `delta_on` fields to find matches and checks the compare
+ fields to determine if a record has changed. Note that this process only
+ considers added and modified records in storage; deleted records are not
+ removed from the new dataset version.
+ This calculation is based on the difference between the current version
+ of the source and the version used to create the dataset.
+ delta_on: A list of fields that uniquely identify rows in the source.
+ If two rows have the same values, they are considered the same (e.g., they
+ could be different versions of the same row in a versioned source).
+ This is used in the delta update to calculate the diff.
+ delta_result_on: A list of fields in the resulting dataset that correspond
+ to the `delta_on` fields from the source.
+ This is needed to identify rows that have changed in the source but are
+ already present in the current version of the resulting dataset, in order
+ to avoid including outdated versions of those rows in the new dataset.
+ We retain only the latest versions of rows to prevent duplication.
+ There is no need to define this if the `delta_on` fields are present in
+ the final dataset and have not been renamed.
+ delta_compare: A list of fields used to check if the same row has been modified
+ in the new version of the source.
+ If not defined, all fields except those defined in `delta_on` will be used.
+ delta_retry: Controls which records to reprocess. Can be:
+ - A string specifying a field name: Records where this field is not None
+ will be reprocessed (error checking mode).
+ - True: Records that exist in the source dataset but not in the result
+ dataset (based on delta_on/delta_result_on fields) will be reprocessed
+ (missing records mode).
+ - False or None: No retry processing.

  Returns:
  DataChain: A DataChain object containing the file information.
@@ -130,8 +144,6 @@ def read_storage(
  catalog = session.catalog
  cache = catalog.cache
  client_config = session.catalog.client_config
- listing_namespace_name = catalog.metastore.system_namespace_name
- listing_project_name = catalog.metastore.listing_project_name

  uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -155,13 +167,7 @@ def read_storage(
  )
  continue

- dc = read_dataset(
- list_ds_name,
- namespace=listing_namespace_name,
- project=listing_project_name,
- session=session,
- settings=settings,
- )
+ dc = read_dataset(list_ds_name, session=session, settings=settings)
  dc._query.update = update
  dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -176,11 +182,7 @@ def read_storage(
  settings=settings,
  in_memory=in_memory,
  )
- .settings(
- prefetch=0,
- namespace=listing_namespace_name,
- project=listing_project_name,
- )
+ .settings(prefetch=0)
  .gen(
  list_bucket(lst_uri, cache, client_config=client_config),
  output={f"{column}": file_type},
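
A hedged sketch of `read_storage` with the delta parameters documented above; the bucket URI and field names are placeholders, and the delta optimization only takes effect once a previous version of the saved dataset exists:

```py
import datachain as dc

chain = dc.read_storage(
    "s3://my-bucket/images/",             # placeholder URI
    anon=True,                            # treat the bucket as public
    delta=True,                           # diff against the version used for the last save
    delta_on=["file.path", "file.etag"],  # illustrative identity fields
)
chain.save("images_meta")                 # later runs reprocess only new or changed files
```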
datachain/lib/file.py CHANGED
@@ -5,14 +5,13 @@ import json
  import logging
  import os
  import posixpath
- import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Iterator
  from contextlib import contextmanager
  from datetime import datetime
  from functools import partial
  from io import BytesIO
- from pathlib import Path, PurePath, PurePosixPath
+ from pathlib import Path, PurePosixPath
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
  from urllib.parse import unquote, urlparse
  from urllib.request import url2pathname
@@ -70,7 +69,7 @@ class FileExporter(NodesThreadPool):
  for task in done:
  task.result()

- def do_task(self, file: "File"):
+ def do_task(self, file):
  file.export(
  self.output,
  self.placement,
@@ -275,8 +274,8 @@ class File(DataModel):

  @field_validator("path", mode="before")
  @classmethod
- def validate_path(cls, path: str) -> str:
- return PurePath(path).as_posix() if path else ""
+ def validate_path(cls, path):
+ return Path(path).as_posix() if path else ""

  def model_dump_custom(self):
  res = self.model_dump()
@@ -338,11 +337,11 @@ class File(DataModel):
  return cls(**{key: row[key] for key in cls._datachain_column_types})

  @property
- def name(self) -> str:
+ def name(self):
  return PurePosixPath(self.path).name

  @property
- def parent(self) -> str:
+ def parent(self):
  return str(PurePosixPath(self.path).parent)

  @contextmanager
@@ -392,7 +391,7 @@ class File(DataModel):

  client.upload(self.read(), destination)

- def _symlink_to(self, destination: str) -> None:
+ def _symlink_to(self, destination: str):
  if self.location:
  raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -401,7 +400,7 @@ class File(DataModel):
  source = self.get_local_path()
  assert source, "File was not cached"
  elif self.source.startswith("file://"):
- source = self.get_fs_path()
+ source = self.get_path()
  else:
  raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -482,62 +481,27 @@ class File(DataModel):

  def get_file_ext(self):
  """Returns last part of file name without `.`."""
- return PurePosixPath(self.path).suffix.lstrip(".")
+ return PurePosixPath(self.path).suffix.strip(".")

  def get_file_stem(self):
  """Returns file name without extension."""
  return PurePosixPath(self.path).stem

  def get_full_name(self):
- """
- [DEPRECATED] Use `file.path` directly instead.
-
- Returns name with parent directories.
- """
- warnings.warn(
- "file.get_full_name() is deprecated and will be removed "
- "in a future version. Use `file.path` directly.",
- DeprecationWarning,
- stacklevel=2,
- )
+ """Returns name with parent directories."""
  return self.path

- def get_path_normalized(self) -> str:
- if not self.path:
- raise FileError("path must not be empty", self.source, self.path)
-
- if self.path.endswith("/"):
- raise FileError("path must not be a directory", self.source, self.path)
-
- normpath = os.path.normpath(self.path)
- normpath = PurePath(normpath).as_posix()
-
- if normpath == ".":
- raise FileError("path must not be a directory", self.source, self.path)
-
- if any(part == ".." for part in PurePath(normpath).parts):
- raise FileError("path must not contain '..'", self.source, self.path)
-
- return normpath
-
- def get_uri(self) -> str:
+ def get_uri(self):
  """Returns file URI."""
- return f"{self.source}/{self.get_path_normalized()}"
+ return f"{self.source}/{self.get_full_name()}"

- def get_fs_path(self) -> str:
- """
- Returns file path with respect to the filescheme.
-
- If `normalize` is True, the path is normalized to remove any redundant
- separators and up-level references.
-
- If the file scheme is "file", the path is converted to a local file path
- using `url2pathname`. Otherwise, the original path with scheme is returned.
- """
+ def get_path(self) -> str:
+ """Returns file path."""
  path = unquote(self.get_uri())
- path_parsed = urlparse(path)
- if path_parsed.scheme == "file":
- path = url2pathname(path_parsed.path)
+ source = urlparse(self.source)
+ if source.scheme == "file":
+ path = urlparse(path).path
+ path = url2pathname(path)
  return path

  def get_destination_path(
@@ -552,7 +516,7 @@ class File(DataModel):
  elif placement == "etag":
  path = f"{self.etag}{self.get_file_suffix()}"
  elif placement == "fullpath":
- path = unquote(self.get_path_normalized())
+ path = unquote(self.get_full_name())
  source = urlparse(self.source)
  if source.scheme and source.scheme != "file":
  path = posixpath.join(source.netloc, path)
@@ -590,9 +554,8 @@ class File(DataModel):
  ) from e

  try:
- normalized_path = self.get_path_normalized()
- info = client.fs.info(client.get_full_path(normalized_path))
- converted_info = client.info_to_file(info, normalized_path)
+ info = client.fs.info(client.get_full_path(self.path))
+ converted_info = client.info_to_file(info, self.path)
  return type(self)(
  path=self.path,
  source=self.source,
@@ -603,17 +566,8 @@ class File(DataModel):
  last_modified=converted_info.last_modified,
  location=self.location,
  )
- except FileError as e:
- logger.warning(
- "File error when resolving %s/%s: %s", self.source, self.path, str(e)
- )
  except (FileNotFoundError, PermissionError, OSError) as e:
- logger.warning(
- "File system error when resolving %s/%s: %s",
- self.source,
- self.path,
- str(e),
- )
+ logger.warning("File system error when resolving %s: %s", self.path, str(e))

  return type(self)(
  path=self.path,
@@ -629,8 +583,6 @@ class File(DataModel):

  def resolve(file: File) -> File:
  """
- [DEPRECATED] Use `file.resolve()` directly instead.
-
  Resolve a File object by checking its existence and updating its metadata.

  This function is a wrapper around the File.resolve() method, designed to be
@@ -646,12 +598,6 @@ def resolve(file: File) -> File:
  RuntimeError: If the file's catalog is not set or if
  the file source protocol is unsupported.
  """
- warnings.warn(
- "resolve() is deprecated and will be removed "
- "in a future version. Use file.resolve() directly.",
- DeprecationWarning,
- stacklevel=2,
- )
  return file.resolve()


@@ -999,7 +945,7 @@ class ArrowRow(DataModel):
  ds = dataset(path, **self.kwargs)

  else:
- path = self.file.get_fs_path()
+ path = self.file.get_path()
  ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

  return ds.take([self.index]).to_reader()
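
To illustrate the `get_uri()`/`get_path()` behavior shown on the `+` side of these hunks, a small sketch; the local path is illustrative and the output is for a POSIX system:

```py
from datachain.lib.file import File

f = File(source="file:///tmp/data", path="images/cat.jpg")
print(f.get_uri())   # file:///tmp/data/images/cat.jpg
print(f.get_path())  # /tmp/data/images/cat.jpg  (file:// URIs are converted via url2pathname)
```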
datachain/lib/listing.py CHANGED
@@ -123,9 +123,6 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
  f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
  )

- # we should remove dots from the name
- ds_name = ds_name.replace(".", "_")
-
  return ds_name, lst_uri, path


@@ -198,4 +195,5 @@ def get_listing(
  list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

  ds_name = listing.name if listing else ds_name
+
  return ds_name, list_uri, list_path, bool(listing)
datachain/lib/meta_formats.py CHANGED
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
  from datachain import read_storage

  if schema_from:
- file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
+ file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
  model_code = gen_datamodel_code(
  file, format=format, jmespath=jmespath, model_name=model_name
  )
datachain/lib/pytorch.py CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
  if self.num_samples > 0:
  ds = ds.sample(self.num_samples)
  ds = ds.chunk(total_rank, total_workers)
- yield from ds.to_iter()
+ yield from ds.collect()

  def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
  from datachain.lib.udf import _prefetch_inputs
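
Since `PytorchDataset` iterates the underlying chain (the `ds.collect()` call above) and chunks rows across workers, a typical consumption pattern goes through `DataChain.to_pytorch()`. A hedged sketch, assuming a previously saved dataset named `images_meta` (not part of this diff):

```py
import datachain as dc
from torch.utils.data import DataLoader

chain = dc.read_dataset("images_meta")  # hypothetical dataset
loader = DataLoader(chain.to_pytorch(), batch_size=16, num_workers=2)
for batch in loader:
    ...  # rows are yielded by PytorchDataset, chunked per worker as in the hunk above
```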