datachain 0.24.5__py3-none-any.whl → 0.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/__init__.py CHANGED
@@ -7,6 +7,7 @@ from datachain.lib.dc import (
7
7
  datasets,
8
8
  delete_dataset,
9
9
  listings,
10
+ move_dataset,
10
11
  read_csv,
11
12
  read_database,
12
13
  read_dataset,
@@ -69,6 +70,7 @@ __all__ = [
69
70
  "is_chain_type",
70
71
  "listings",
71
72
  "metrics",
73
+ "move_dataset",
72
74
  "param",
73
75
  "read_csv",
74
76
  "read_database",
@@ -956,26 +956,9 @@ class Catalog:
956
956
  self, dataset: DatasetRecord, conn=None, **kwargs
957
957
  ) -> DatasetRecord:
958
958
  """Updates dataset fields."""
959
- old_name = None
960
- new_name = None
961
- if "name" in kwargs and kwargs["name"] != dataset.name:
962
- old_name = dataset.name
963
- new_name = kwargs["name"]
964
-
965
- dataset = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
966
-
967
- if old_name and new_name:
968
- # updating name must result in updating dataset table names as well
969
- for version in [v.version for v in dataset.versions]:
970
- self.warehouse.rename_dataset_table(
971
- dataset,
972
- old_name,
973
- new_name,
974
- old_version=version,
975
- new_version=version,
976
- )
977
-
978
- return dataset
959
+ dataset_updated = self.metastore.update_dataset(dataset, conn=conn, **kwargs)
960
+ self.warehouse.rename_dataset_tables(dataset, dataset_updated)
961
+ return dataset_updated
979
962
 
980
963
  def remove_dataset_version(
981
964
  self, dataset: DatasetRecord, version: str, drop_rows: Optional[bool] = True
@@ -1555,12 +1538,14 @@ class Catalog:
1555
1538
  remote_ds.project.namespace.name,
1556
1539
  description=remote_ds.project.namespace.descr,
1557
1540
  uuid=remote_ds.project.namespace.uuid,
1541
+ validate=False,
1558
1542
  )
1559
1543
  project = self.metastore.create_project(
1560
1544
  namespace.name,
1561
1545
  remote_ds.project.name,
1562
1546
  description=remote_ds.project.descr,
1563
1547
  uuid=remote_ds.project.uuid,
1548
+ validate=False,
1564
1549
  )
1565
1550
 
1566
1551
  try:
@@ -207,6 +207,10 @@ class AbstractMetastore(ABC, Serializable):
207
207
  It also creates project if not found and create flag is set to True.
208
208
  """
209
209
 
210
+ @abstractmethod
211
+ def get_project_by_id(self, project_id: int, conn=None) -> Project:
212
+ """Gets a single project by id"""
213
+
210
214
  @abstractmethod
211
215
  def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
212
216
  """Gets list of projects in some namespace or in general (in all namespaces)"""
@@ -851,6 +855,24 @@ class AbstractDBMetastore(AbstractMetastore):
851
855
  )
852
856
  return self.project_class.parse(*rows[0])
853
857
 
858
+ def get_project_by_id(self, project_id: int, conn=None) -> Project:
859
+ """Gets a single project by id"""
860
+ n = self._namespaces
861
+ p = self._projects
862
+
863
+ query = self._projects_select(
864
+ *(getattr(n.c, f) for f in self._namespaces_fields),
865
+ *(getattr(p.c, f) for f in self._projects_fields),
866
+ )
867
+ query = query.select_from(n.join(p, n.c.id == p.c.namespace_id)).where(
868
+ p.c.id == project_id
869
+ )
870
+
871
+ rows = list(self.db.execute(query, conn=conn))
872
+ if not rows:
873
+ raise ProjectNotFoundError(f"Project with id {project_id} not found.")
874
+ return self.project_class.parse(*rows[0])
875
+
854
876
  def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
855
877
  """
856
878
  Gets a list of projects inside some namespace, or in all namespaces
@@ -1008,6 +1030,11 @@ class AbstractDBMetastore(AbstractMetastore):
1008
1030
  else:
1009
1031
  values[field] = json.dumps(value)
1010
1032
  dataset_values[field] = DatasetRecord.parse_schema(value)
1033
+ elif field == "project_id":
1034
+ if not value:
1035
+ raise ValueError("Cannot set empty project_id for dataset")
1036
+ dataset_values["project"] = self.get_project_by_id(value)
1037
+ values[field] = value
1011
1038
  else:
1012
1039
  values[field] = value
1013
1040
  dataset_values[field] = value
@@ -1017,7 +1044,9 @@ class AbstractDBMetastore(AbstractMetastore):
1017
1044
 
1018
1045
  d = self._datasets
1019
1046
  self.db.execute(
1020
- self._datasets_update().where(d.c.name == dataset.name).values(values),
1047
+ self._datasets_update()
1048
+ .where(d.c.name == dataset.name, d.c.project_id == dataset.project.id)
1049
+ .values(values),
1021
1050
  conn=conn,
1022
1051
  ) # type: ignore [attr-defined]
1023
1052
 
@@ -356,24 +356,23 @@ class AbstractWarehouse(ABC, Serializable):
356
356
  self, dataset: DatasetRecord, version: str
357
357
  ) -> list[StorageURI]: ...
358
358
 
359
- def rename_dataset_table(
360
- self,
361
- dataset: DatasetRecord,
362
- old_name: str,
363
- new_name: str,
364
- old_version: str,
365
- new_version: str,
359
+ def rename_dataset_tables(
360
+ self, dataset: DatasetRecord, dataset_updated: DatasetRecord
366
361
  ) -> None:
367
- namespace = dataset.project.namespace.name
368
- project = dataset.project.name
369
- old_ds_table_name = self._construct_dataset_table_name(
370
- namespace, project, old_name, old_version
371
- )
372
- new_ds_table_name = self._construct_dataset_table_name(
373
- namespace, project, new_name, new_version
374
- )
375
-
376
- self.db.rename_table(old_ds_table_name, new_ds_table_name)
362
+ """
363
+ Renames all dataset version tables when parts of the dataset that
364
+ are used in constructing the table name are updated.
365
+ If nothing important is changed, nothing will be renamed (no DB calls
366
+ will be made at all).
367
+ """
368
+ for version in [v.version for v in dataset_updated.versions]:
369
+ if not dataset.has_version(version):
370
+ continue
371
+ src = self.dataset_table_name(dataset, version)
372
+ dest = self.dataset_table_name(dataset_updated, version)
373
+ if src == dest:
374
+ continue
375
+ self.db.rename_table(src, dest)
377
376
 
378
377
  def dataset_rows_count(self, dataset: DatasetRecord, version=None) -> int:
379
378
  """Returns total number of rows in a dataset"""
@@ -1,7 +1,7 @@
1
1
  from .csv import read_csv
2
2
  from .database import read_database
3
3
  from .datachain import C, Column, DataChain
4
- from .datasets import datasets, delete_dataset, read_dataset
4
+ from .datasets import datasets, delete_dataset, move_dataset, read_dataset
5
5
  from .hf import read_hf
6
6
  from .json import read_json
7
7
  from .listings import listings
@@ -22,6 +22,7 @@ __all__ = [
22
22
  "datasets",
23
23
  "delete_dataset",
24
24
  "listings",
25
+ "move_dataset",
25
26
  "read_csv",
26
27
  "read_database",
27
28
  "read_dataset",
@@ -361,3 +361,58 @@ def delete_dataset(
361
361
  else:
362
362
  version = None
363
363
  catalog.remove_dataset(name, ds_project, version=version, force=force)
364
+
365
+
366
+ def move_dataset(
367
+ src: str,
368
+ dest: str,
369
+ session: Optional[Session] = None,
370
+ in_memory: bool = False,
371
+ ) -> None:
372
+ """Moves an entire dataset between namespaces and projects.
373
+
374
+ Args:
375
+ src: The source dataset name. This can be a fully qualified name that includes
376
+ the namespace and project, or a regular name. If a regular name is used,
377
+ default values will be applied. The source dataset will no longer exist
378
+ after the move.
379
+ dest: The destination dataset name. This can also be a fully qualified
380
+ name with a namespace and project, or just a regular name (default values
381
+ will be used in that case). The original dataset will be moved here.
382
+ session: An optional session instance. If not provided, the default session
383
+ will be used.
384
+ in_memory: If True, creates an in-memory session. Defaults to False.
385
+
386
+ Returns:
387
+ None
388
+
389
+ Examples:
390
+ ```python
391
+ import datachain as dc
392
+ dc.move_dataset("cats", "new_cats")
393
+ ```
394
+
395
+ ```python
396
+ import datachain as dc
397
+ dc.move_dataset("dev.animals.cats", "prod.animals.cats")
398
+ ```
399
+ """
400
+ session = Session.get(session, in_memory=in_memory)
401
+ catalog = session.catalog
402
+
403
+ namespace, project, name = catalog.get_full_dataset_name(src)
404
+ dest_namespace, dest_project, dest_name = catalog.get_full_dataset_name(dest)
405
+
406
+ dataset = catalog.get_dataset(
407
+ name, catalog.metastore.get_project(project, namespace)
408
+ )
409
+
410
+ catalog.update_dataset(
411
+ dataset,
412
+ name=dest_name,
413
+ project_id=catalog.metastore.get_project(
414
+ dest_project,
415
+ dest_namespace,
416
+ create=catalog.metastore.project_allowed_to_create,
417
+ ).id,
418
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.5
3
+ Version: 0.25.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -1,4 +1,4 @@
1
- datachain/__init__.py,sha256=gGeaUZXIGQIMCllVXCyDinLfW6oIn33vlK1bXfCAJjI,1578
1
+ datachain/__init__.py,sha256=ofXacfzLKYzTqU1oyHz5xZi1L4skQCoJdUMC4YARenk,1616
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
4
4
  datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
@@ -21,7 +21,7 @@ datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
21
21
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
22
22
  datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
23
23
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
24
- datachain/catalog/catalog.py,sha256=3QwiljkEHWm5xNmvqT3ey_JvKS8viiJgGWwhbqWIH0M,65996
24
+ datachain/catalog/catalog.py,sha256=QTWCXy75iWo-0MCXyfV_WbsKeZ1fpLpvL8d60rxn1ws,65528
25
25
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
26
26
  datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
27
27
  datachain/cli/__init__.py,sha256=WvBqnwjG8Wp9xGCn-4eqfoZ3n7Sj1HJemCi4MayJh_c,8221
@@ -49,11 +49,11 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
49
49
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
50
50
  datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
51
51
  datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
52
- datachain/data_storage/metastore.py,sha256=af7TsMHbANXmWKIu-LzQmsZpNxM6-hhzZfInWx7MQXI,52667
52
+ datachain/data_storage/metastore.py,sha256=Qw332arvhgXB4UY0yX-Hu8Vgl3smU12l6bvxrL9Q-vo,53810
53
53
  datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
54
54
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
55
55
  datachain/data_storage/sqlite.py,sha256=TTQjdDXUaZSr3MEaxZjDhsVIkIJqxFNA-sD25TO3m_4,30228
56
- datachain/data_storage/warehouse.py,sha256=2Bp2fXfcm-acwYjDWqVzGjoIQSAR4L56GPNtPcaT2gU,32418
56
+ datachain/data_storage/warehouse.py,sha256=nhF8yfpdJpstpXnv_sj7WFzU97JkvSeqetqJQp33cyE,32563
57
57
  datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
58
58
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
59
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -100,11 +100,11 @@ datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djM
100
100
  datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
101
101
  datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
102
102
  datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
103
- datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
103
+ datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
104
104
  datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
105
105
  datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
106
106
  datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
107
- datachain/lib/dc/datasets.py,sha256=MzM7MTn90Q-dZYuMNUzJXRW1YHOpHeFHspadRjQoI70,13297
107
+ datachain/lib/dc/datasets.py,sha256=eBhcybEeXHcQ_7RweRCh5uJyF5Ym1EEDPmD0YWYDPHw,15097
108
108
  datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
109
109
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
110
110
  datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
157
157
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
158
158
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
159
159
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
160
- datachain-0.24.5.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
- datachain-0.24.5.dist-info/METADATA,sha256=Z2iA5yN6lZ_rNmi3YqSOj-j5jvJ-HCIpy8sVQvb3vR4,13281
162
- datachain-0.24.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
- datachain-0.24.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
- datachain-0.24.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
- datachain-0.24.5.dist-info/RECORD,,
160
+ datachain-0.25.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
+ datachain-0.25.0.dist-info/METADATA,sha256=8CTAh5kMX-1sYBuh5CXD6u_gLS1yIWwWTwF1_umz7ek,13281
162
+ datachain-0.25.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ datachain-0.25.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
+ datachain-0.25.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
+ datachain-0.25.0.dist-info/RECORD,,