datachain 0.21.1__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +23 -6
  43. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,17 @@
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

- from datachain.error import DatasetVersionNotFoundError
+ from datachain.dataset import parse_dataset_name
+ from datachain.error import (
+     DatasetNotFoundError,
+     DatasetVersionNotFoundError,
+     ProjectNotFoundError,
+ )
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
      File,
  )
+ from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
@@ -24,12 +30,18 @@ if TYPE_CHECKING:

  def read_dataset(
      name: str,
+     namespace: Optional[str] = None,
+     project: Optional[str] = None,
      version: Optional[Union[str, int]] = None,
      session: Optional[Session] = None,
      settings: Optional[dict] = None,
      fallback_to_studio: bool = True,
      delta: Optional[bool] = False,
-     delta_on: Optional[Union[str, Sequence[str]]] = None,
+     delta_on: Optional[Union[str, Sequence[str]]] = (
+         "file.path",
+         "file.etag",
+         "file.version",
+     ),
      delta_result_on: Optional[Union[str, Sequence[str]]] = None,
      delta_compare: Optional[Union[str, Sequence[str]]] = None,
      delta_retry: Optional[Union[bool, str]] = None,
@@ -38,47 +50,36 @@ def read_dataset(
      If dataset or version is not found locally, it will try to pull it from Studio.

      Parameters:
-         name : dataset name
+         name: The dataset name, which can be a fully qualified name including the
+             namespace and project. Alternatively, it can be a regular name, in which
+             case the explicitly defined namespace and project will be used if they are
+             set; otherwise, default values will be applied.
+         namespace : optional name of namespace in which dataset to read is created
+         project : optional name of project in which dataset to read is created
          version : dataset version
          session : Session to use for the chain.
          settings : Settings to use for the chain.
          fallback_to_studio : Try to pull dataset from Studio if not found locally.
              Default is True.
-         delta: If set to True, we optimize the creation of new dataset versions by
-             calculating the diff between the latest version of this storage and the
-             version used to create the most recent version of the resulting chain
-             dataset (the one specified in `.save()`). We then run the "diff" chain
-             using only the diff data, rather than the entire storage data, and merge
-             that diff chain with the latest version of the resulting dataset to create
-             a new version. This approach avoids applying modifications to all records
-             from storage every time, which can be an expensive operation.
-             The diff is calculated using the `DataChain.compare()` method, which
-             compares the `delta_on` fields to find matches and checks the compare
-             fields to determine if a record has changed. Note that this process only
-             considers added and modified records in storage; deleted records are not
-             removed from the new dataset version.
-             This calculation is based on the difference between the current version
-             of the source and the version used to create the dataset.
-         delta_on: A list of fields that uniquely identify rows in the source.
-             If two rows have the same values, they are considered the same (e.g., they
-             could be different versions of the same row in a versioned source).
-             This is used in the delta update to calculate the diff.
-         delta_result_on: A list of fields in the resulting dataset that correspond
-             to the `delta_on` fields from the source.
-             This is needed to identify rows that have changed in the source but are
-             already present in the current version of the resulting dataset, in order
-             to avoid including outdated versions of those rows in the new dataset.
-             We retain only the latest versions of rows to prevent duplication.
-             There is no need to define this if the `delta_on` fields are present in
-             the final dataset and have not been renamed.
-         delta_compare: A list of fields used to check if the same row has been modified
-             in the new version of the source.
-             If not defined, all fields except those defined in delta_on will be used.
-         delta_retry: Specifies retry behavior for delta processing. If a string,
-             it's the name of a field in the result dataset that indicates an error
-             when not None - records with errors will be reprocessed. If True,
-             records that exist in the source dataset but not in the result dataset
-             will be reprocessed.
+         delta: If True, only process new or changed files instead of reprocessing
+             everything. This saves time by skipping files that were already processed in
+             previous versions. The optimization is working when a new version of the
+             dataset is created.
+             Default is False.
+         delta_on: Field(s) that uniquely identify each record in the source data.
+             Used to detect which records are new or changed.
+             Default is ("file.path", "file.etag", "file.version").
+         delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+             Only needed if you rename the identifying fields during processing.
+             Default is None.
+         delta_compare: Field(s) used to detect if a record has changed.
+             If not specified, all fields except `delta_on` fields are used.
+             Default is None.
+         delta_retry: Controls retry behavior for failed records:
+             - String (field name): Reprocess records where this field is not empty
+               (error mode)
+             - True: Reprocess records missing from the result dataset (missing mode)
+             - None: No retry processing (default)

      Example:
      ```py
@@ -86,6 +87,11 @@ def read_dataset(
      chain = dc.read_dataset("my_cats")
      ```

+     ```py
+     import datachain as dc
+     chain = dc.read_dataset("dev.animals.my_cats")
+     ```
+
      ```py
      chain = dc.read_dataset("my_cats", fallback_to_studio=False)
      ```
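The second docstring example above uses the new fully qualified `namespace.project.name` form. A hedged sketch of what should be the equivalent call using the explicit `namespace`/`project` parameters added in this release (the `dev` namespace and `animals` project are placeholders):

```py
import datachain as dc

# Assumed equivalent of dc.read_dataset("dev.animals.my_cats"),
# relying on the new optional namespace/project arguments.
chain = dc.read_dataset("my_cats", namespace="dev", project="animals")
```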
@@ -116,6 +122,15 @@ def read_dataset(

      from .datachain import DataChain

+     session = Session.get(session)
+     catalog = session.catalog
+
+     namespace_name, project_name, name = parse_dataset_name(name)
+     namespace_name = (
+         namespace_name or namespace or catalog.metastore.default_namespace_name
+     )
+     project_name = project_name or project or catalog.metastore.default_project_name
+
      if version is not None:
          try:
              # for backward compatibility we still allow users to put version as integer
@@ -125,7 +140,15 @@ def read_dataset(
              # all 2.* dataset versions). If dataset doesn't have any versions where
              # major part is equal to that input, exception is thrown.
              major = int(version)
-             dataset = Session.get(session).catalog.get_dataset(name)
+             try:
+                 ds_project = get_project(project_name, namespace_name, session=session)
+             except ProjectNotFoundError:
+                 raise DatasetNotFoundError(
+                     f"Dataset {name} not found in namespace {namespace_name} and",
+                     f" project {project_name}",
+                 ) from None
+
+             dataset = session.catalog.get_dataset(name, ds_project)
              latest_major = dataset.latest_major_version(major)
              if not latest_major:
                  raise DatasetVersionNotFoundError(
@@ -136,19 +159,22 @@ def read_dataset(
          # version is in new semver string format, continuing as normal
          pass

+     if settings:
+         _settings = Settings(**settings)
+     else:
+         _settings = Settings()
+
      query = DatasetQuery(
          name=name,
+         project_name=project_name,
+         namespace_name=namespace_name,
          version=version,  # type: ignore[arg-type]
          session=session,
          indexing_column_types=File._datachain_column_types,
          fallback_to_studio=fallback_to_studio,
      )
-     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
-     if settings:
-         _settings = Settings(**settings)
-     else:
-         _settings = Settings()

+     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
      signals_schema = SignalSchema({"sys": Sys})
      if query.feature_schema:
          signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -200,7 +226,7 @@ def datasets(
      import datachain as dc

      chain = dc.datasets(column="dataset")
-     for ds in chain.collect("dataset"):
+     for ds in chain.to_iter("dataset"):
          print(f"{ds.name}@v{ds.version}")
      ```
      """
@@ -251,6 +277,8 @@ def datasets(

  def delete_dataset(
      name: str,
+     namespace: Optional[str] = None,
+     project: Optional[str] = None,
      version: Optional[str] = None,
      force: Optional[bool] = False,
      studio: Optional[bool] = False,
@@ -261,11 +289,16 @@ def delete_dataset(
      a force flag.

      Args:
-         name : Dataset name
+         name: The dataset name, which can be a fully qualified name including the
+             namespace and project. Alternatively, it can be a regular name, in which
+             case the explicitly defined namespace and project will be used if they are
+             set; otherwise, default values will be applied.
+         namespace : optional name of namespace in which dataset to delete is created
+         project : optional name of project in which dataset to delete is created
          version : Optional dataset version
          force: If true, all datasets versions will be removed. Defaults to False.
-         studio: If True, removes dataset from Studio only,
-             otherwise remove from local. Defaults to False.
+         studio: If True, removes dataset from Studio only, otherwise removes local
+             dataset. Defaults to False.
          session: Optional session instance. If not provided, uses default session.
          in_memory: If True, creates an in-memory session. Defaults to False.

@@ -282,11 +315,32 @@ def delete_dataset(
      dc.delete_dataset("cats", version="1.0.0")
      ```
      """
+     from datachain.studio import remove_studio_dataset

      session = Session.get(session, in_memory=in_memory)
      catalog = session.catalog
+
+     namespace_name, project_name, name = parse_dataset_name(name)
+     namespace_name = (
+         namespace_name or namespace or catalog.metastore.default_namespace_name
+     )
+     project_name = project_name or project or catalog.metastore.default_project_name
+
+     if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+         return remove_studio_dataset(
+             None, name, namespace_name, project_name, version=version, force=force
+         )
+
+     try:
+         ds_project = get_project(project_name, namespace_name, session=session)
+     except ProjectNotFoundError:
+         raise DatasetNotFoundError(
+             f"Dataset {name} not found in namespace {namespace_name} and project",
+             f" {project_name}",
+         ) from None
+
      if not force:
-         version = version or catalog.get_dataset(name).latest_version
+         version = version or catalog.get_dataset(name, ds_project).latest_version
      else:
          version = None
-     catalog.remove_dataset(name, version=version, force=force, studio=studio)
+     catalog.remove_dataset(name, ds_project, version=version, force=force)
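With `delete_dataset` now namespace- and project-aware, a fully qualified name can address the dataset directly. A hedged sketch (the `dev` namespace and `animals` project are placeholders, not taken from the diff):

```py
import datachain as dc

# Remove one version of a dataset in the "dev" namespace, "animals" project;
# when no namespace/project is embedded or passed, the defaults apply.
dc.delete_dataset("dev.animals.cats", version="1.0.0")
```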
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
              return sa.select(*columns)

          table_name = self.catalog.warehouse.dataset_table_name(
-             self.dataset_name, self.dataset_version
+             self.dataset, self.dataset_version
          )
          dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
          table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
          )

          return step_result(
-             q, table.columns, dependencies=[(self.dataset_name, self.dataset_version)]
+             q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
          )


@@ -142,7 +142,7 @@ def read_listing_dataset(
      _settings = Settings(prefetch=0)
      signal_schema = SignalSchema({"sys": Sys, "file": File})

-     query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
+     query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
      query.version = version
      # We already know that this is a listing dataset,
      # so we can set the listing function to True
@@ -68,6 +68,7 @@ def read_records(

      dsr = catalog.create_dataset(
          name,
+         catalog.metastore.default_project,
          columns=columns,
          feature_schema=(
              signal_schema.clone_without_sys_signals().serialize()
@@ -35,7 +35,11 @@ def read_storage(
      update: bool = False,
      anon: bool = False,
      delta: Optional[bool] = False,
-     delta_on: Optional[Union[str, Sequence[str]]] = None,
+     delta_on: Optional[Union[str, Sequence[str]]] = (
+         "file.path",
+         "file.etag",
+         "file.version",
+     ),
      delta_result_on: Optional[Union[str, Sequence[str]]] = None,
      delta_compare: Optional[Union[str, Sequence[str]]] = None,
      delta_retry: Optional[Union[bool, str]] = None,
@@ -54,43 +58,25 @@ def read_storage(
          update : force storage reindexing. Default is False.
          anon : If True, we will treat cloud bucket as public one
          client_config : Optional client configuration for the storage client.
-         delta: If set to True, we optimize the creation of new dataset versions by
-             calculating the diff between the latest version of this storage and the
-             version used to create the most recent version of the resulting chain
-             dataset (the one specified in `.save()`). We then run the "diff" chain
-             using only the diff data, rather than the entire storage data, and merge
-             that diff chain with the latest version of the resulting dataset to create
-             a new version. This approach avoids applying modifications to all records
-             from storage every time, which can be an expensive operation.
-             The diff is calculated using the `DataChain.compare()` method, which
-             compares the `delta_on` fields to find matches and checks the compare
-             fields to determine if a record has changed. Note that this process only
-             considers added and modified records in storage; deleted records are not
-             removed from the new dataset version.
-             This calculation is based on the difference between the current version
-             of the source and the version used to create the dataset.
-         delta_on: A list of fields that uniquely identify rows in the source.
-             If two rows have the same values, they are considered the same (e.g., they
-             could be different versions of the same row in a versioned source).
-             This is used in the delta update to calculate the diff.
-         delta_result_on: A list of fields in the resulting dataset that correspond
-             to the `delta_on` fields from the source.
-             This is needed to identify rows that have changed in the source but are
-             already present in the current version of the resulting dataset, in order
-             to avoid including outdated versions of those rows in the new dataset.
-             We retain only the latest versions of rows to prevent duplication.
-             There is no need to define this if the `delta_on` fields are present in
-             the final dataset and have not been renamed.
-         delta_compare: A list of fields used to check if the same row has been modified
-             in the new version of the source.
-             If not defined, all fields except those defined in `delta_on` will be used.
-         delta_retry: Controls which records to reprocess. Can be:
-             - A string specifying a field name: Records where this field is not None
-               will be reprocessed (error checking mode).
-             - True: Records that exist in the source dataset but not in the result
-               dataset (based on delta_on/delta_result_on fields) will be reprocessed
-               (missing records mode).
-             - False or None: No retry processing.
+         delta: If True, only process new or changed files instead of reprocessing
+             everything. This saves time by skipping files that were already processed in
+             previous versions. The optimization is working when a new version of the
+             dataset is created.
+             Default is False.
+         delta_on: Field(s) that uniquely identify each record in the source data.
+             Used to detect which records are new or changed.
+             Default is ("file.path", "file.etag", "file.version").
+         delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+             Only needed if you rename the identifying fields during processing.
+             Default is None.
+         delta_compare: Field(s) used to detect if a record has changed.
+             If not specified, all fields except `delta_on` fields are used.
+             Default is None.
+         delta_retry: Controls retry behavior for failed records:
+             - String (field name): Reprocess records where this field is not empty
+               (error mode)
+             - True: Reprocess records missing from the result dataset (missing mode)
+             - None: No retry processing (default)

      Returns:
          DataChain: A DataChain object containing the file information.
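Based on the rewritten docstring above, a minimal delta-processing sketch (the bucket URI, the `size_kb` signal, and the dataset name are placeholders, not taken from the diff):

```py
import datachain as dc

# Only files that are new or changed since the previous "images_processed"
# version should be run through the UDF; earlier results are carried over.
chain = (
    dc.read_storage("s3://my-bucket/images/", delta=True)
    .map(size_kb=lambda file: file.size / 1024, output=float)
    .save("images_processed")
)
```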
@@ -144,6 +130,8 @@ def read_storage(
      catalog = session.catalog
      cache = catalog.cache
      client_config = session.catalog.client_config
+     listing_namespace_name = catalog.metastore.system_namespace_name
+     listing_project_name = catalog.metastore.listing_project_name

      uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -167,7 +155,13 @@ def read_storage(
              )
              continue

-         dc = read_dataset(list_ds_name, session=session, settings=settings)
+         dc = read_dataset(
+             list_ds_name,
+             namespace=listing_namespace_name,
+             project=listing_project_name,
+             session=session,
+             settings=settings,
+         )
          dc._query.update = update
          dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -182,7 +176,11 @@ def read_storage(
                  settings=settings,
                  in_memory=in_memory,
              )
-             .settings(prefetch=0)
+             .settings(
+                 prefetch=0,
+                 namespace=listing_namespace_name,
+                 project=listing_project_name,
+             )
              .gen(
                  list_bucket(lst_uri, cache, client_config=client_config),
                  output={f"{column}": file_type},
datachain/lib/file.py CHANGED
@@ -5,13 +5,14 @@ import json
  import logging
  import os
  import posixpath
+ import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Iterator
  from contextlib import contextmanager
  from datetime import datetime
  from functools import partial
  from io import BytesIO
- from pathlib import Path, PurePosixPath
+ from pathlib import Path, PurePath, PurePosixPath
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
  from urllib.parse import unquote, urlparse
  from urllib.request import url2pathname
@@ -69,7 +70,7 @@ class FileExporter(NodesThreadPool):
          for task in done:
              task.result()

-     def do_task(self, file):
+     def do_task(self, file: "File"):
          file.export(
              self.output,
              self.placement,
@@ -274,8 +275,8 @@ class File(DataModel):

      @field_validator("path", mode="before")
      @classmethod
-     def validate_path(cls, path):
-         return Path(path).as_posix() if path else ""
+     def validate_path(cls, path: str) -> str:
+         return PurePath(path).as_posix() if path else ""

      def model_dump_custom(self):
          res = self.model_dump()
@@ -337,11 +338,11 @@ class File(DataModel):
          return cls(**{key: row[key] for key in cls._datachain_column_types})

      @property
-     def name(self):
+     def name(self) -> str:
          return PurePosixPath(self.path).name

      @property
-     def parent(self):
+     def parent(self) -> str:
          return str(PurePosixPath(self.path).parent)

      @contextmanager
@@ -391,7 +392,7 @@ class File(DataModel):

          client.upload(self.read(), destination)

-     def _symlink_to(self, destination: str):
+     def _symlink_to(self, destination: str) -> None:
          if self.location:
              raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -400,7 +401,7 @@ class File(DataModel):
              source = self.get_local_path()
              assert source, "File was not cached"
          elif self.source.startswith("file://"):
-             source = self.get_path()
+             source = self.get_fs_path()
          else:
              raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -481,27 +482,62 @@ class File(DataModel):

      def get_file_ext(self):
          """Returns last part of file name without `.`."""
-         return PurePosixPath(self.path).suffix.strip(".")
+         return PurePosixPath(self.path).suffix.lstrip(".")

      def get_file_stem(self):
          """Returns file name without extension."""
          return PurePosixPath(self.path).stem

      def get_full_name(self):
-         """Returns name with parent directories."""
+         """
+         [DEPRECATED] Use `file.path` directly instead.
+
+         Returns name with parent directories.
+         """
+         warnings.warn(
+             "file.get_full_name() is deprecated and will be removed "
+             "in a future version. Use `file.path` directly.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
          return self.path

-     def get_uri(self):
+     def get_path_normalized(self) -> str:
+         if not self.path:
+             raise FileError("path must not be empty", self.source, self.path)
+
+         if self.path.endswith("/"):
+             raise FileError("path must not be a directory", self.source, self.path)
+
+         normpath = os.path.normpath(self.path)
+         normpath = PurePath(normpath).as_posix()
+
+         if normpath == ".":
+             raise FileError("path must not be a directory", self.source, self.path)
+
+         if any(part == ".." for part in PurePath(normpath).parts):
+             raise FileError("path must not contain '..'", self.source, self.path)
+
+         return normpath
+
+     def get_uri(self) -> str:
          """Returns file URI."""
-         return f"{self.source}/{self.get_full_name()}"
+         return f"{self.source}/{self.get_path_normalized()}"

-     def get_path(self) -> str:
-         """Returns file path."""
+     def get_fs_path(self) -> str:
+         """
+         Returns file path with respect to the filescheme.
+
+         If `normalize` is True, the path is normalized to remove any redundant
+         separators and up-level references.
+
+         If the file scheme is "file", the path is converted to a local file path
+         using `url2pathname`. Otherwise, the original path with scheme is returned.
+         """
          path = unquote(self.get_uri())
-         source = urlparse(self.source)
-         if source.scheme == "file":
-             path = urlparse(path).path
-             path = url2pathname(path)
+         path_parsed = urlparse(path)
+         if path_parsed.scheme == "file":
+             path = url2pathname(path_parsed.path)
          return path

      def get_destination_path(
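A hedged illustration of the new `get_path_normalized()` checks, assuming a `File` object can be constructed directly as shown (the expected values follow from `os.path.normpath`):

```py
from datachain.lib.file import File

f = File(source="s3://bucket", path="dir/./sub//img.jpg")
print(f.get_path_normalized())  # dir/sub/img.jpg
print(f.get_uri())              # s3://bucket/dir/sub/img.jpg
# Empty paths, paths ending in "/", "." alone, or paths containing ".."
# now raise FileError instead of producing a malformed URI.
```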
@@ -516,7 +552,7 @@ class File(DataModel):
          elif placement == "etag":
              path = f"{self.etag}{self.get_file_suffix()}"
          elif placement == "fullpath":
-             path = unquote(self.get_full_name())
+             path = unquote(self.get_path_normalized())
              source = urlparse(self.source)
              if source.scheme and source.scheme != "file":
                  path = posixpath.join(source.netloc, path)
@@ -554,8 +590,9 @@ class File(DataModel):
              ) from e

          try:
-             info = client.fs.info(client.get_full_path(self.path))
-             converted_info = client.info_to_file(info, self.path)
+             normalized_path = self.get_path_normalized()
+             info = client.fs.info(client.get_full_path(normalized_path))
+             converted_info = client.info_to_file(info, normalized_path)
              return type(self)(
                  path=self.path,
                  source=self.source,
@@ -566,8 +603,17 @@ class File(DataModel):
                  last_modified=converted_info.last_modified,
                  location=self.location,
              )
+         except FileError as e:
+             logger.warning(
+                 "File error when resolving %s/%s: %s", self.source, self.path, str(e)
+             )
          except (FileNotFoundError, PermissionError, OSError) as e:
-             logger.warning("File system error when resolving %s: %s", self.path, str(e))
+             logger.warning(
+                 "File system error when resolving %s/%s: %s",
+                 self.source,
+                 self.path,
+                 str(e),
+             )

          return type(self)(
              path=self.path,
@@ -583,6 +629,8 @@ class File(DataModel):

  def resolve(file: File) -> File:
      """
+     [DEPRECATED] Use `file.resolve()` directly instead.
+
      Resolve a File object by checking its existence and updating its metadata.

      This function is a wrapper around the File.resolve() method, designed to be
@@ -598,6 +646,12 @@ def resolve(file: File) -> File:
          RuntimeError: If the file's catalog is not set or if
              the file source protocol is unsupported.
      """
+     warnings.warn(
+         "resolve() is deprecated and will be removed "
+         "in a future version. Use file.resolve() directly.",
+         DeprecationWarning,
+         stacklevel=2,
+     )
      return file.resolve()

@@ -945,7 +999,7 @@ class ArrowRow(DataModel):
              ds = dataset(path, **self.kwargs)

          else:
-             path = self.file.get_path()
+             path = self.file.get_fs_path()
              ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

          return ds.take([self.index]).to_reader()
datachain/lib/listing.py CHANGED
@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
          f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
      )

+     # we should remove dots from the name
+     ds_name = ds_name.replace(".", "_")
+
      return ds_name, lst_uri, path


@@ -195,5 +198,4 @@ def get_listing(
          list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

      ds_name = listing.name if listing else ds_name
-
      return ds_name, list_uri, list_path, bool(listing)
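The dot-to-underscore replacement in `parse_listing_uri` keeps listing dataset names free of dots, so they cannot be misread as the new `namespace.project.name` form. A trivial illustration of the sanitization step (the prefix and bucket name are made up for the example):

```py
# Mirrors the ds_name.replace(".", "_") call added above.
ds_name = "lst__s3://my.data.bucket/images/"
assert ds_name.replace(".", "_") == "lst__s3://my_data_bucket/images/"
```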
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
      from datachain import read_storage

      if schema_from:
-         file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
+         file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
          model_code = gen_datamodel_code(
              file, format=format, jmespath=jmespath, model_name=model_name
          )
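Several call sites in this release move from `collect()` to `to_values()`/`to_iter()`, as in the hunk above. A hedged migration sketch (the dataset name is a placeholder):

```py
import datachain as dc

chain = dc.read_dataset("my_cats")

# 0.21.x style:  first_file = next(chain.limit(1).collect("file"))
# 0.22.0 style:
first_file = chain.limit(1).to_values("file")[0]  # materialize one column
for f in chain.to_iter("file"):                    # or iterate lazily
    print(f.path)
```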