datachain 0.20.2__py3-none-any.whl → 0.20.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
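
The headline API change in this release shows up in the first file section below: datachain/__init__.py stops re-exporting the namespaces and projects helper modules and exposes a single create_project function instead. A minimal sketch of the new import surface (illustrative only; the helper's own signature is not part of this diff):

import datachain as dc

# 0.20.2 re-exported the helper modules themselves: dc.namespaces, dc.projects
# 0.20.4 re-exports one function instead:
dc.create_project  # alias for datachain.lib.projects.create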



datachain/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from datachain.lib import namespaces, projects
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
  from datachain.lib.dc import (
      C,
@@ -33,6 +32,7 @@ from datachain.lib.file import (
      VideoFrame,
  )
  from datachain.lib.model_store import ModelStore
+ from datachain.lib.projects import create as create_project
  from datachain.lib.udf import Aggregator, Generator, Mapper
  from datachain.lib.utils import AbstractUDF, DataChainError
  from datachain.query import metrics, param
@@ -63,14 +63,13 @@ __all__ = [
      "VideoFile",
      "VideoFragment",
      "VideoFrame",
+     "create_project",
      "datasets",
      "delete_dataset",
      "is_chain_type",
      "listings",
      "metrics",
-     "namespaces",
      "param",
-     "projects",
      "read_csv",
      "read_database",
      "read_dataset",
datachain/cache.py CHANGED
@@ -39,7 +39,7 @@ def temporary_cache(
          cache.destroy()


- class Cache:
+ class Cache:  # noqa: PLW1641
      def __init__(self, cache_dir: str, tmp_dir: str):
          self.odb = LocalHashFileDB(
              LocalFileSystem(),
@@ -76,9 +76,9 @@ class Cache:
      async def download(
          self, file: "File", client: "Client", callback: Optional[Callback] = None
      ) -> None:
-         from_path = f"{file.source}/{file.path}"
          from dvc_objects.fs.utils import tmp_fname

+         from_path = file.get_uri()
          odb_fs = self.odb.fs
          tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
          size = file.size
@@ -1491,13 +1491,13 @@ class Catalog:

          namespace = self.metastore.create_namespace(
              remote_ds.project.namespace.name,
-             description=remote_ds.project.namespace.description,
+             description=remote_ds.project.namespace.descr,
              uuid=remote_ds.project.namespace.uuid,
          )
          project = self.metastore.create_project(
-             remote_ds.project.name,
              namespace.name,
-             description=remote_ds.project.description,
+             remote_ds.project.name,
+             description=remote_ds.project.descr,
              uuid=remote_ds.project.uuid,
          )

@@ -63,8 +63,8 @@ def ls_local(
              print(format_ls_entry(entry))
      else:
          # Collect results in a list here to prevent interference from `tqdm` and `print`
-         listing = list(listings().collect("listing"))
-         for ls in listing:
+         listing = listings().to_list("listing")
+         for (ls,) in listing:
              print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


@@ -207,13 +207,14 @@ class Client(ABC):
      )

      async def get_current_etag(self, file: "File") -> str:
+         file_path = file.get_path_normalized()
          kwargs = {}
          if self._is_version_aware():
              kwargs["version_id"] = file.version
          info = await self.fs._info(
-             self.get_full_path(file.path, file.version), **kwargs
+             self.get_full_path(file_path, file.version), **kwargs
          )
-         return self.info_to_file(info, file.path).etag
+         return self.info_to_file(info, file_path).etag

      def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
          info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
@@ -386,7 +387,8 @@ class Client(ABC):
              return open(cache_path, mode="rb")
          assert not file.location
          return FileWrapper(
-             self.fs.open(self.get_full_path(file.path, file.version)), cb
+             self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
+             cb,
          )  # type: ignore[return-value]

      def upload(self, data: bytes, path: str) -> "File":
datachain/client/local.py CHANGED
@@ -99,7 +99,7 @@ class FileClient(Client):
      )

      async def get_current_etag(self, file: "File") -> str:
-         info = self.fs.info(self.get_full_path(file.path))
+         info = self.fs.info(self.get_full_path(file.get_path_normalized()))
          return self.info_to_file(info, "").etag

      async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
@@ -138,8 +138,8 @@ class FileClient(Client):
          if not self.use_symlinks:
              super().fetch_nodes(nodes, shared_progress_bar)

-     def do_instantiate_object(self, uid, dst):
+     def do_instantiate_object(self, file: File, dst: str) -> None:
          if self.use_symlinks:
-             os.symlink(Path(self.name, uid.path), dst)
+             os.symlink(Path(self.name, file.path), dst)
          else:
-             super().do_instantiate_object(uid, dst)
+             super().do_instantiate_object(file, dst)
@@ -176,7 +176,9 @@ class AbstractMetastore(ABC, Serializable):

      @cached_property
      def default_project(self) -> Project:
-         return self.get_project(self.default_project_name, self.default_namespace_name)
+         return self.get_project(
+             self.default_project_name, self.default_namespace_name, create=True
+         )

      @cached_property
      def listing_project(self) -> Project:
@@ -185,8 +187,8 @@ class AbstractMetastore(ABC, Serializable):
      @abstractmethod
      def create_project(
          self,
-         name: str,
          namespace_name: str,
+         name: str,
          description: Optional[str] = None,
          uuid: Optional[str] = None,
          ignore_if_exists: bool = True,
@@ -195,8 +197,13 @@ class AbstractMetastore(ABC, Serializable):
          """Creates new project in specific namespace"""

      @abstractmethod
-     def get_project(self, name: str, namespace_name: str, conn=None) -> Project:
-         """Gets a single project inside some namespace by name"""
+     def get_project(
+         self, name: str, namespace_name: str, create: bool = False, conn=None
+     ) -> Project:
+         """
+         Gets a single project inside some namespace by name.
+         It also creates project if not found and create flag is set to True.
+         """

      @abstractmethod
      def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
@@ -763,14 +770,18 @@ class AbstractDBMetastore(AbstractMetastore):

      def create_project(
          self,
-         name: str,
          namespace_name: str,
+         name: str,
          description: Optional[str] = None,
          uuid: Optional[str] = None,
          ignore_if_exists: bool = True,
          **kwargs,
      ) -> Project:
-         namespace = self.get_namespace(namespace_name)
+         try:
+             namespace = self.get_namespace(namespace_name)
+         except NamespaceNotFoundError:
+             namespace = self.create_namespace(namespace_name)
+
          query = self._projects_insert().values(
              namespace_id=namespace.id,
              uuid=uuid or str(uuid4()),
@@ -788,7 +799,9 @@ class AbstractDBMetastore(AbstractMetastore):

          return self.get_project(name, namespace.name)

-     def get_project(self, name: str, namespace_name: str, conn=None) -> Project:
+     def get_project(
+         self, name: str, namespace_name: str, create: bool = False, conn=None
+     ) -> Project:
          """Gets a single project inside some namespace by name"""
          n = self._namespaces
          p = self._projects
@@ -803,6 +816,8 @@ class AbstractDBMetastore(AbstractMetastore):

          rows = list(self.db.execute(query, conn=conn))
          if not rows:
+             if create:
+                 return self.create_project(namespace_name, name)
              raise ProjectNotFoundError(
                  f"Project {name} in namespace {namespace_name} not found."
              )
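
Taken together, the AbstractMetastore / AbstractDBMetastore hunks above let get_project lazily create a missing project, and create_project lazily create a missing namespace, with create_project's positional order flipped to namespace name first. A rough sketch of the new behaviour, assuming get_catalog() is the entry point to the local metastore and using made-up namespace/project names:

from datachain.catalog import get_catalog  # assumed entry point to the local catalog

metastore = get_catalog().metastore

# 0.20.2 raised ProjectNotFoundError for a missing project; with create=True the
# project (and, via create_project, its namespace) is now created on first use.
project = metastore.get_project("my_project", "my_namespace", create=True)

# Note the swapped positional order: namespace name first, then project name.
project = metastore.create_project("my_namespace", "my_project", description="demo")
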
@@ -469,10 +469,7 @@ class SQLiteMetastore(AbstractDBMetastore):
          Studio dataset.
          """
          system_namespace = self.create_namespace(Namespace.system(), "System namespace")
-         self.create_project(Project.listing(), system_namespace.name, "Listing project")
-
-         local_namespace = self.create_namespace(Namespace.default(), "Local namespace")
-         self.create_project(Project.default(), local_namespace.name, "Local project")
+         self.create_project(system_namespace.name, Project.listing(), "Listing project")

      def _check_schema_version(self) -> None:
          """
datachain/dataset.py CHANGED
@@ -83,10 +83,11 @@ def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
      if not name:
          raise ValueError("Name must be defined to parse it")
      split = name.split(".")
-     if len(split) == 3:
-         return tuple(split)  # type: ignore[return-value]
+     name = split[-1]
+     project_name = split[-2] if len(split) > 1 else None
+     namespace_name = split[-3] if len(split) > 2 else None

-     return None, None, name
+     return namespace_name, project_name, name


  class DatasetDependencyType:
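
The rewritten parse_dataset_name accepts partially qualified dataset names instead of only bare or fully qualified ones. A short illustration of the new return values (the names are made up):

from datachain.dataset import parse_dataset_name

# namespace.project.name -> all three parts
assert parse_dataset_name("dev.analytics.clicks") == ("dev", "analytics", "clicks")
# project.name -> namespace stays None
assert parse_dataset_name("analytics.clicks") == (None, "analytics", "clicks")
# bare name -> only the dataset name
assert parse_dataset_name("clicks") == (None, None, "clicks")
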
datachain/delta.py CHANGED
@@ -62,7 +62,7 @@ def _get_delta_chain(
      )

      # Calculate diff between source versions
-     return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+     return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)


  def _get_retry_chain(
@@ -237,7 +237,7 @@ def delta_retry_update(
          return None, None, False

      latest_dataset = datachain.read_dataset(name, version=latest_version)
-     compared_chain = latest_dataset.compare(
+     compared_chain = latest_dataset.diff(
          processing_chain,
          on=right_on or on,
          added=True,
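
Both delta helpers above switch from compare to DataChain.diff with the same keyword arguments. A hedged sketch of the call shape (the dataset name, versions, and the on column are placeholders):

import datachain as dc

old = dc.read_dataset("clicks", version="1.0.0")
new = dc.read_dataset("clicks", version="1.0.1")

# 0.20.2 called .compare(...); 0.20.4 calls .diff(...) with the same arguments.
changed = new.diff(old, on="file", deleted=False)
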
datachain/func/func.py CHANGED
@@ -25,7 +25,7 @@ if TYPE_CHECKING:
  ColT = Union[str, Column, ColumnElement, "Func", tuple]


- class Func(Function):
+ class Func(Function):  # noqa: PLW1641
      """Represents a function to be applied to a column in a SQL query."""

      def __init__(
datachain/lib/arrow.py CHANGED
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
              fs_path = file.path
              fs = ReferenceFileSystem({fs_path: [cache_path]})
          else:
-             fs, fs_path = file.get_fs(), file.get_path()
+             fs, fs_path = file.get_fs(), file.get_fs_path()

          kwargs = self.kwargs
          if format := kwargs.get("format"):
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
          kwargs["format"] = fix_pyarrow_format(format, parse_options)

      schemas = []
-     for file in chain.collect("file"):
-         ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+     for (file,) in chain.to_iter("file"):
+         ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
          schemas.append(ds.schema)
      if not schemas:
          raise ValueError(
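
These call sites move from collect, which yielded bare values when a single column was requested, to to_iter/to_list, which yield one tuple per row, hence the for (file,) in ... unpacking. A brief illustration (the dataset name is made up):

import datachain as dc

chain = dc.read_dataset("clicks")

# to_iter()/to_list() always yield row tuples, so a single column still
# needs tuple unpacking:
for (file,) in chain.to_iter("file"):
    print(file.path)

rows = chain.to_list("file")  # list of 1-tuples: [(File(...),), ...]
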
@@ -22,8 +22,8 @@ if TYPE_CHECKING:

  class DatasetInfo(DataModel):
      name: str
-     namespace_name: str
-     project_name: str
+     namespace: str
+     project: str
      uuid: str = Field(default=str(uuid4()))
      version: str = Field(default=DEFAULT_DATASET_VERSION)
      status: int = Field(default=DatasetStatus.CREATED)
@@ -93,8 +93,8 @@ class DatasetInfo(DataModel):
          return cls(
              uuid=version.uuid,
              name=dataset.name,
-             namespace_name=dataset.project.namespace.name,
-             project_name=dataset.project.name,
+             namespace=dataset.project.namespace.name,
+             project=dataset.project.name,
              version=version.version,
              status=version.status,
              created_at=version.created_at,
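
Finally, the DatasetInfo model renames its qualifier fields from namespace_name/project_name to namespace/project. A sketch of reading the renamed fields, assuming datachain.datasets() still yields DatasetInfo objects under the "dataset" signal:

import datachain as dc

for (info,) in dc.datasets().to_iter("dataset"):
    # 0.20.2: info.namespace_name / info.project_name
    # 0.20.4: info.namespace / info.project
    print(f"{info.namespace}.{info.project}.{info.name}@v{info.version}")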