datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,16 @@
  from collections.abc import Sequence
  from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

- from datachain.error import DatasetVersionNotFoundError
+ from datachain.error import (
+     DatasetNotFoundError,
+     DatasetVersionNotFoundError,
+     ProjectNotFoundError,
+ )
  from datachain.lib.dataset_info import DatasetInfo
  from datachain.lib.file import (
      File,
  )
+ from datachain.lib.projects import get as get_project
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
@@ -24,12 +29,18 @@ if TYPE_CHECKING:

  def read_dataset(
      name: str,
+     namespace: Optional[str] = None,
+     project: Optional[str] = None,
      version: Optional[Union[str, int]] = None,
      session: Optional[Session] = None,
      settings: Optional[dict] = None,
      fallback_to_studio: bool = True,
      delta: Optional[bool] = False,
-     delta_on: Optional[Union[str, Sequence[str]]] = None,
+     delta_on: Optional[Union[str, Sequence[str]]] = (
+         "file.path",
+         "file.etag",
+         "file.version",
+     ),
      delta_result_on: Optional[Union[str, Sequence[str]]] = None,
      delta_compare: Optional[Union[str, Sequence[str]]] = None,
      delta_retry: Optional[Union[bool, str]] = None,
@@ -38,47 +49,36 @@ def read_dataset(
      If dataset or version is not found locally, it will try to pull it from Studio.

      Parameters:
-         name : dataset name
+         name: The dataset name, which can be a fully qualified name including the
+             namespace and project. Alternatively, it can be a regular name, in which
+             case the explicitly defined namespace and project will be used if they are
+             set; otherwise, default values will be applied.
+         namespace : optional name of namespace in which dataset to read is created
+         project : optional name of project in which dataset to read is created
          version : dataset version
          session : Session to use for the chain.
          settings : Settings to use for the chain.
          fallback_to_studio : Try to pull dataset from Studio if not found locally.
              Default is True.
-         delta: If set to True, we optimize the creation of new dataset versions by
-             calculating the diff between the latest version of this storage and the
-             version used to create the most recent version of the resulting chain
-             dataset (the one specified in `.save()`). We then run the "diff" chain
-             using only the diff data, rather than the entire storage data, and merge
-             that diff chain with the latest version of the resulting dataset to create
-             a new version. This approach avoids applying modifications to all records
-             from storage every time, which can be an expensive operation.
-             The diff is calculated using the `DataChain.compare()` method, which
-             compares the `delta_on` fields to find matches and checks the compare
-             fields to determine if a record has changed. Note that this process only
-             considers added and modified records in storage; deleted records are not
-             removed from the new dataset version.
-             This calculation is based on the difference between the current version
-             of the source and the version used to create the dataset.
-         delta_on: A list of fields that uniquely identify rows in the source.
-             If two rows have the same values, they are considered the same (e.g., they
-             could be different versions of the same row in a versioned source).
-             This is used in the delta update to calculate the diff.
-         delta_result_on: A list of fields in the resulting dataset that correspond
-             to the `delta_on` fields from the source.
-             This is needed to identify rows that have changed in the source but are
-             already present in the current version of the resulting dataset, in order
-             to avoid including outdated versions of those rows in the new dataset.
-             We retain only the latest versions of rows to prevent duplication.
-             There is no need to define this if the `delta_on` fields are present in
-             the final dataset and have not been renamed.
-         delta_compare: A list of fields used to check if the same row has been modified
-             in the new version of the source.
-             If not defined, all fields except those defined in delta_on will be used.
-         delta_retry: Specifies retry behavior for delta processing. If a string,
-             it's the name of a field in the result dataset that indicates an error
-             when not None - records with errors will be reprocessed. If True,
-             records that exist in the source dataset but not in the result dataset
-             will be reprocessed.
+         delta: If True, only process new or changed files instead of reprocessing
+             everything. This saves time by skipping files that were already processed in
+             previous versions. The optimization is working when a new version of the
+             dataset is created.
+             Default is False.
+         delta_on: Field(s) that uniquely identify each record in the source data.
+             Used to detect which records are new or changed.
+             Default is ("file.path", "file.etag", "file.version").
+         delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+             Only needed if you rename the identifying fields during processing.
+             Default is None.
+         delta_compare: Field(s) used to detect if a record has changed.
+             If not specified, all fields except `delta_on` fields are used.
+             Default is None.
+         delta_retry: Controls retry behavior for failed records:
+             - String (field name): Reprocess records where this field is not empty
+               (error mode)
+             - True: Reprocess records missing from the result dataset (missing mode)
+             - None: No retry processing (default)

      Example:
          ```py
@@ -86,6 +86,11 @@ def read_dataset(
          chain = dc.read_dataset("my_cats")
          ```

+         ```py
+         import datachain as dc
+         chain = dc.read_dataset("dev.animals.my_cats")
+         ```
+
          ```py
          chain = dc.read_dataset("my_cats", fallback_to_studio=False)
          ```
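The new `namespace`/`project` parameters accept either explicit arguments or a fully qualified `namespace.project.name` string, as the updated docstring and examples describe. A minimal sketch of both spellings, assuming a local setup where a hypothetical `dev.animals.cats` dataset already exists:

```py
import datachain as dc

# Fully qualified name: <namespace>.<project>.<dataset>
cats = dc.read_dataset("dev.animals.cats", version="1.0.0")

# Equivalent call with the namespace and project passed explicitly
cats = dc.read_dataset("cats", namespace="dev", project="animals", version="1.0.0")
```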
@@ -116,6 +121,15 @@ def read_dataset(

      from .datachain import DataChain

+     session = Session.get(session)
+     catalog = session.catalog
+
+     namespace_name, project_name, name = catalog.get_full_dataset_name(
+         name,
+         project_name=project,
+         namespace_name=namespace,
+     )
+
      if version is not None:
          try:
              # for backward compatibility we still allow users to put version as integer
@@ -125,7 +139,15 @@ def read_dataset(
              # all 2.* dataset versions). If dataset doesn't have any versions where
              # major part is equal to that input, exception is thrown.
              major = int(version)
-             dataset = Session.get(session).catalog.get_dataset(name)
+             try:
+                 ds_project = get_project(project_name, namespace_name, session=session)
+             except ProjectNotFoundError:
+                 raise DatasetNotFoundError(
+                     f"Dataset {name} not found in namespace {namespace_name} and",
+                     f" project {project_name}",
+                 ) from None
+
+             dataset = session.catalog.get_dataset(name, ds_project)
              latest_major = dataset.latest_major_version(major)
              if not latest_major:
                  raise DatasetVersionNotFoundError(
@@ -136,19 +158,22 @@ def read_dataset(
              # version is in new semver string format, continuing as normal
              pass

+     if settings:
+         _settings = Settings(**settings)
+     else:
+         _settings = Settings()
+
      query = DatasetQuery(
          name=name,
+         project_name=project_name,
+         namespace_name=namespace_name,
          version=version,  # type: ignore[arg-type]
          session=session,
          indexing_column_types=File._datachain_column_types,
          fallback_to_studio=fallback_to_studio,
      )
-     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
-     if settings:
-         _settings = Settings(**settings)
-     else:
-         _settings = Settings()

+     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
      signals_schema = SignalSchema({"sys": Sys})
      if query.feature_schema:
          signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -200,7 +225,7 @@ def datasets(
          import datachain as dc

          chain = dc.datasets(column="dataset")
-         for ds in chain.collect("dataset"):
+         for ds in chain.to_iter("dataset"):
              print(f"{ds.name}@v{ds.version}")
          ```
      """
@@ -251,6 +276,8 @@ def datasets(

  def delete_dataset(
      name: str,
+     namespace: Optional[str] = None,
+     project: Optional[str] = None,
      version: Optional[str] = None,
      force: Optional[bool] = False,
      studio: Optional[bool] = False,
@@ -261,11 +288,16 @@ def delete_dataset(
      a force flag.

      Args:
-         name : Dataset name
+         name: The dataset name, which can be a fully qualified name including the
+             namespace and project. Alternatively, it can be a regular name, in which
+             case the explicitly defined namespace and project will be used if they are
+             set; otherwise, default values will be applied.
+         namespace : optional name of namespace in which dataset to delete is created
+         project : optional name of project in which dataset to delete is created
          version : Optional dataset version
          force: If true, all datasets versions will be removed. Defaults to False.
-         studio: If True, removes dataset from Studio only,
-             otherwise remove from local. Defaults to False.
+         studio: If True, removes dataset from Studio only, otherwise removes local
+             dataset. Defaults to False.
          session: Optional session instance. If not provided, uses default session.
          in_memory: If True, creates an in-memory session. Defaults to False.

@@ -282,11 +314,32 @@ def delete_dataset(
          dc.delete_dataset("cats", version="1.0.0")
          ```
      """
+     from datachain.studio import remove_studio_dataset

      session = Session.get(session, in_memory=in_memory)
      catalog = session.catalog
+
+     namespace_name, project_name, name = catalog.get_full_dataset_name(
+         name,
+         project_name=project,
+         namespace_name=namespace,
+     )
+
+     if not catalog.metastore.is_local_dataset(namespace_name) and studio:
+         return remove_studio_dataset(
+             None, name, namespace_name, project_name, version=version, force=force
+         )
+
+     try:
+         ds_project = get_project(project_name, namespace_name, session=session)
+     except ProjectNotFoundError:
+         raise DatasetNotFoundError(
+             f"Dataset {name} not found in namespace {namespace_name} and project",
+             f" {project_name}",
+         ) from None
+
      if not force:
-         version = version or catalog.get_dataset(name).latest_version
+         version = version or catalog.get_dataset(name, ds_project).latest_version
      else:
          version = None
-     catalog.remove_dataset(name, version=version, force=force, studio=studio)
+     catalog.remove_dataset(name, ds_project, version=version, force=force)
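For illustration, a hedged sketch of how the reworked `delete_dataset` reads from user code; the `dev.animals.cats` dataset is hypothetical:

```py
import datachain as dc

# Remove a single version, addressing the dataset by its fully qualified name
dc.delete_dataset("dev.animals.cats", version="1.0.0")

# Remove every version, with the namespace and project passed explicitly
dc.delete_dataset("cats", namespace="dev", project="animals", force=True)
```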
@@ -37,7 +37,7 @@ class ReadOnlyQueryStep(QueryStep):
              return sa.select(*columns)

          table_name = self.catalog.warehouse.dataset_table_name(
-             self.dataset_name, self.dataset_version
+             self.dataset, self.dataset_version
          )
          dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
          table = dataset_row_cls.new_table(
@@ -51,7 +51,7 @@ class ReadOnlyQueryStep(QueryStep):
          )

          return step_result(
-             q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+             q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
          )


@@ -142,7 +142,7 @@ def read_listing_dataset(
      _settings = Settings(prefetch=0)
      signal_schema = SignalSchema({"sys": Sys, "file": File})

-     query.starting_step = ReadOnlyQueryStep(query.catalog, name, version)
+     query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
      query.version = version
      # We already know that this is a listing dataset,
      # so we can set the listing function to True
@@ -68,6 +68,7 @@ def read_records(

      dsr = catalog.create_dataset(
          name,
+         catalog.metastore.default_project,
          columns=columns,
          feature_schema=(
              signal_schema.clone_without_sys_signals().serialize()
@@ -96,4 +97,4 @@ def read_records(
      for chunk in batched(records, INSERT_BATCH_SIZE):
          warehouse.insert_rows(table, chunk)
      warehouse.insert_rows_done(table)
-     return read_dataset(name=dsr.name, session=session, settings=settings)
+     return read_dataset(name=dsr.full_name, session=session, settings=settings)
@@ -35,7 +35,11 @@ def read_storage(
      update: bool = False,
      anon: bool = False,
      delta: Optional[bool] = False,
-     delta_on: Optional[Union[str, Sequence[str]]] = None,
+     delta_on: Optional[Union[str, Sequence[str]]] = (
+         "file.path",
+         "file.etag",
+         "file.version",
+     ),
      delta_result_on: Optional[Union[str, Sequence[str]]] = None,
      delta_compare: Optional[Union[str, Sequence[str]]] = None,
      delta_retry: Optional[Union[bool, str]] = None,
@@ -54,43 +58,25 @@ def read_storage(
          update : force storage reindexing. Default is False.
          anon : If True, we will treat cloud bucket as public one
          client_config : Optional client configuration for the storage client.
-         delta: If set to True, we optimize the creation of new dataset versions by
-             calculating the diff between the latest version of this storage and the
-             version used to create the most recent version of the resulting chain
-             dataset (the one specified in `.save()`). We then run the "diff" chain
-             using only the diff data, rather than the entire storage data, and merge
-             that diff chain with the latest version of the resulting dataset to create
-             a new version. This approach avoids applying modifications to all records
-             from storage every time, which can be an expensive operation.
-             The diff is calculated using the `DataChain.compare()` method, which
-             compares the `delta_on` fields to find matches and checks the compare
-             fields to determine if a record has changed. Note that this process only
-             considers added and modified records in storage; deleted records are not
-             removed from the new dataset version.
-             This calculation is based on the difference between the current version
-             of the source and the version used to create the dataset.
-         delta_on: A list of fields that uniquely identify rows in the source.
-             If two rows have the same values, they are considered the same (e.g., they
-             could be different versions of the same row in a versioned source).
-             This is used in the delta update to calculate the diff.
-         delta_result_on: A list of fields in the resulting dataset that correspond
-             to the `delta_on` fields from the source.
-             This is needed to identify rows that have changed in the source but are
-             already present in the current version of the resulting dataset, in order
-             to avoid including outdated versions of those rows in the new dataset.
-             We retain only the latest versions of rows to prevent duplication.
-             There is no need to define this if the `delta_on` fields are present in
-             the final dataset and have not been renamed.
-         delta_compare: A list of fields used to check if the same row has been modified
-             in the new version of the source.
-             If not defined, all fields except those defined in `delta_on` will be used.
-         delta_retry: Controls which records to reprocess. Can be:
-             - A string specifying a field name: Records where this field is not None
-               will be reprocessed (error checking mode).
-             - True: Records that exist in the source dataset but not in the result
-               dataset (based on delta_on/delta_result_on fields) will be reprocessed
-               (missing records mode).
-             - False or None: No retry processing.
+         delta: If True, only process new or changed files instead of reprocessing
+             everything. This saves time by skipping files that were already processed in
+             previous versions. The optimization is working when a new version of the
+             dataset is created.
+             Default is False.
+         delta_on: Field(s) that uniquely identify each record in the source data.
+             Used to detect which records are new or changed.
+             Default is ("file.path", "file.etag", "file.version").
+         delta_result_on: Field(s) in the result dataset that match `delta_on` fields.
+             Only needed if you rename the identifying fields during processing.
+             Default is None.
+         delta_compare: Field(s) used to detect if a record has changed.
+             If not specified, all fields except `delta_on` fields are used.
+             Default is None.
+         delta_retry: Controls retry behavior for failed records:
+             - String (field name): Reprocess records where this field is not empty
+               (error mode)
+             - True: Reprocess records missing from the result dataset (missing mode)
+             - None: No retry processing (default)

      Returns:
          DataChain: A DataChain object containing the file information.
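The shortened delta documentation boils down to the pattern below: save the chain once, then re-run the same script and only new or changed files are processed into the next dataset version. A minimal sketch, assuming a hypothetical `s3://my-bucket/photos/` prefix and a user-defined mapper:

```py
import datachain as dc

def describe(file: dc.File) -> str:
    # hypothetical per-file processing
    return file.get_file_ext()

chain = (
    dc.read_storage("s3://my-bucket/photos/", delta=True)
    # delta_on defaults to ("file.path", "file.etag", "file.version")
    .map(ext=describe)
    .save("photos_processed")
)
```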
@@ -144,6 +130,8 @@ def read_storage(
      catalog = session.catalog
      cache = catalog.cache
      client_config = session.catalog.client_config
+     listing_namespace_name = catalog.metastore.system_namespace_name
+     listing_project_name = catalog.metastore.listing_project_name

      uris = uri if isinstance(uri, (list, tuple)) else [uri]

@@ -167,7 +155,13 @@ def read_storage(
              )
              continue

-         dc = read_dataset(list_ds_name, session=session, settings=settings)
+         dc = read_dataset(
+             list_ds_name,
+             namespace=listing_namespace_name,
+             project=listing_project_name,
+             session=session,
+             settings=settings,
+         )
          dc._query.update = update
          dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})

@@ -182,7 +176,11 @@ def read_storage(
                  settings=settings,
                  in_memory=in_memory,
              )
-             .settings(prefetch=0)
+             .settings(
+                 prefetch=0,
+                 namespace=listing_namespace_name,
+                 project=listing_project_name,
+             )
              .gen(
                  list_bucket(lst_uri, cache, client_config=client_config),
                  output={f"{column}": file_type},
datachain/lib/file.py CHANGED
@@ -5,13 +5,14 @@ import json
  import logging
  import os
  import posixpath
+ import warnings
  from abc import ABC, abstractmethod
  from collections.abc import Iterator
  from contextlib import contextmanager
  from datetime import datetime
  from functools import partial
  from io import BytesIO
- from pathlib import Path, PurePosixPath
+ from pathlib import Path, PurePath, PurePosixPath
  from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
  from urllib.parse import unquote, urlparse
  from urllib.request import url2pathname
@@ -69,7 +70,7 @@ class FileExporter(NodesThreadPool):
          for task in done:
              task.result()

-     def do_task(self, file):
+     def do_task(self, file: "File"):
          file.export(
              self.output,
              self.placement,
@@ -274,8 +275,8 @@ class File(DataModel):

      @field_validator("path", mode="before")
      @classmethod
-     def validate_path(cls, path):
-         return Path(path).as_posix() if path else ""
+     def validate_path(cls, path: str) -> str:
+         return PurePath(path).as_posix() if path else ""

      def model_dump_custom(self):
          res = self.model_dump()
@@ -337,11 +338,11 @@ class File(DataModel):
          return cls(**{key: row[key] for key in cls._datachain_column_types})

      @property
-     def name(self):
+     def name(self) -> str:
          return PurePosixPath(self.path).name

      @property
-     def parent(self):
+     def parent(self) -> str:
          return str(PurePosixPath(self.path).parent)

      @contextmanager
@@ -391,7 +392,7 @@ class File(DataModel):

          client.upload(self.read(), destination)

-     def _symlink_to(self, destination: str):
+     def _symlink_to(self, destination: str) -> None:
          if self.location:
              raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")

@@ -400,7 +401,7 @@ class File(DataModel):
              source = self.get_local_path()
              assert source, "File was not cached"
          elif self.source.startswith("file://"):
-             source = self.get_path()
+             source = self.get_fs_path()
          else:
              raise OSError(errno.EXDEV, "can't link across filesystems")

@@ -481,27 +482,62 @@ class File(DataModel):

      def get_file_ext(self):
          """Returns last part of file name without `.`."""
-         return PurePosixPath(self.path).suffix.strip(".")
+         return PurePosixPath(self.path).suffix.lstrip(".")

      def get_file_stem(self):
          """Returns file name without extension."""
          return PurePosixPath(self.path).stem

      def get_full_name(self):
-         """Returns name with parent directories."""
+         """
+         [DEPRECATED] Use `file.path` directly instead.
+
+         Returns name with parent directories.
+         """
+         warnings.warn(
+             "file.get_full_name() is deprecated and will be removed "
+             "in a future version. Use `file.path` directly.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
          return self.path

-     def get_uri(self):
+     def get_path_normalized(self) -> str:
+         if not self.path:
+             raise FileError("path must not be empty", self.source, self.path)
+
+         if self.path.endswith("/"):
+             raise FileError("path must not be a directory", self.source, self.path)
+
+         normpath = os.path.normpath(self.path)
+         normpath = PurePath(normpath).as_posix()
+
+         if normpath == ".":
+             raise FileError("path must not be a directory", self.source, self.path)
+
+         if any(part == ".." for part in PurePath(normpath).parts):
+             raise FileError("path must not contain '..'", self.source, self.path)
+
+         return normpath
+
+     def get_uri(self) -> str:
          """Returns file URI."""
-         return f"{self.source}/{self.get_full_name()}"
+         return f"{self.source}/{self.get_path_normalized()}"

-     def get_path(self) -> str:
-         """Returns file path."""
+     def get_fs_path(self) -> str:
+         """
+         Returns file path with respect to the filescheme.
+
+         If `normalize` is True, the path is normalized to remove any redundant
+         separators and up-level references.
+
+         If the file scheme is "file", the path is converted to a local file path
+         using `url2pathname`. Otherwise, the original path with scheme is returned.
+         """
          path = unquote(self.get_uri())
-         source = urlparse(self.source)
-         if source.scheme == "file":
-             path = urlparse(path).path
-             path = url2pathname(path)
+         path_parsed = urlparse(path)
+         if path_parsed.scheme == "file":
+             path = url2pathname(path_parsed.path)
          return path

      def get_destination_path(
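A hedged illustration of how the added and renamed helpers behave, based on the logic above (the sources and paths here are made up):

```py
from datachain.lib.file import File

local = File(source="file:///tmp/data", path="images/./cat.jpg")
print(local.get_path_normalized())  # images/cat.jpg
print(local.get_fs_path())          # /tmp/data/images/cat.jpg (OS-dependent form)

remote = File(source="s3://my-bucket", path="images/cat.jpg")
print(remote.get_uri())             # s3://my-bucket/images/cat.jpg
print(remote.get_fs_path())         # non-"file" schemes keep the full URI
```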
@@ -516,7 +552,7 @@ class File(DataModel):
          elif placement == "etag":
              path = f"{self.etag}{self.get_file_suffix()}"
          elif placement == "fullpath":
-             path = unquote(self.get_full_name())
+             path = unquote(self.get_path_normalized())
              source = urlparse(self.source)
              if source.scheme and source.scheme != "file":
                  path = posixpath.join(source.netloc, path)
@@ -554,8 +590,9 @@ class File(DataModel):
              ) from e

          try:
-             info = client.fs.info(client.get_full_path(self.path))
-             converted_info = client.info_to_file(info, self.path)
+             normalized_path = self.get_path_normalized()
+             info = client.fs.info(client.get_full_path(normalized_path))
+             converted_info = client.info_to_file(info, normalized_path)
              return type(self)(
                  path=self.path,
                  source=self.source,
@@ -566,8 +603,17 @@ class File(DataModel):
                  last_modified=converted_info.last_modified,
                  location=self.location,
              )
+         except FileError as e:
+             logger.warning(
+                 "File error when resolving %s/%s: %s", self.source, self.path, str(e)
+             )
          except (FileNotFoundError, PermissionError, OSError) as e:
-             logger.warning("File system error when resolving %s: %s", self.path, str(e))
+             logger.warning(
+                 "File system error when resolving %s/%s: %s",
+                 self.source,
+                 self.path,
+                 str(e),
+             )

          return type(self)(
              path=self.path,
@@ -583,6 +629,8 @@ class File(DataModel):

  def resolve(file: File) -> File:
      """
+     [DEPRECATED] Use `file.resolve()` directly instead.
+
      Resolve a File object by checking its existence and updating its metadata.

      This function is a wrapper around the File.resolve() method, designed to be
@@ -598,6 +646,12 @@ def resolve(file: File) -> File:
          RuntimeError: If the file's catalog is not set or if
              the file source protocol is unsupported.
      """
+     warnings.warn(
+         "resolve() is deprecated and will be removed "
+         "in a future version. Use file.resolve() directly.",
+         DeprecationWarning,
+         stacklevel=2,
+     )
      return file.resolve()


@@ -945,7 +999,7 @@ class ArrowRow(DataModel):
              ds = dataset(path, **self.kwargs)

          else:
-             path = self.file.get_path()
+             path = self.file.get_fs_path()
              ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)

          return ds.take([self.index]).to_reader()
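The renames and deprecations in `datachain/lib/file.py` migrate mechanically in user code; a short hedged sketch of the old and new spellings, with `f` standing for any `File` instance:

```py
from datachain.lib.file import File, resolve

def refresh(f: File) -> File:
    # old spellings: get_full_name() and resolve() now emit DeprecationWarning,
    # and get_path() has been renamed
    full_name = f.get_full_name()
    updated = resolve(f)

    # new equivalents
    full_name = f.path          # instead of f.get_full_name()
    fs_path = f.get_fs_path()   # instead of f.get_path()
    updated = f.resolve()       # instead of resolve(f)
    return updated
```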
datachain/lib/listing.py CHANGED
@@ -123,6 +123,9 @@ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
          f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
      )

+     # we should remove dots from the name
+     ds_name = ds_name.replace(".", "_")
+
      return ds_name, lst_uri, path


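The dot replacement presumably keeps listing dataset names from clashing with the new dot-separated `namespace.project.name` addressing; a hedged sketch of the effect (the bucket name is made up):

```py
from datachain.lib.listing import parse_listing_uri

ds_name, lst_uri, path = parse_listing_uri("s3://my.bucket/images/")
# Any dots in the derived listing dataset name become underscores,
# so "my.bucket" contributes "my_bucket" to ds_name.
print(ds_name)
```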
@@ -195,5 +198,4 @@ def get_listing(
          list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"

      ds_name = listing.name if listing else ds_name
-
      return ds_name, list_uri, list_path, bool(listing)
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
      from datachain import read_storage

      if schema_from:
-         file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
+         file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
          model_code = gen_datamodel_code(
              file, format=format, jmespath=jmespath, model_name=model_name
          )
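The same `collect` → `to_values` migration shown here applies to user code; a short hedged sketch (the bucket URI is illustrative):

```py
import datachain as dc

chain = dc.read_storage("gs://datachain-demo/dogs-and-cats/", type="text")
# was: file = next(chain.limit(1).collect("file"))
file = chain.limit(1).to_values("file")[0]
print(file.path)
```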