datachain 0.21.0__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.

This release of datachain has been flagged as potentially problematic.

Files changed (48)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +180 -65
  4. datachain/cli/__init__.py +4 -9
  5. datachain/cli/commands/datasets.py +43 -28
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +422 -37
  12. datachain/data_storage/sqlite.py +136 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +126 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +260 -92
  21. datachain/lib/dc/datasets.py +104 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +1 -0
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/tar.py +1 -2
  33. datachain/lib/udf.py +1 -1
  34. datachain/lib/udf_signature.py +1 -1
  35. datachain/lib/webdataset.py +30 -20
  36. datachain/listing.py +3 -1
  37. datachain/namespace.py +65 -0
  38. datachain/project.py +78 -0
  39. datachain/query/dataset.py +71 -46
  40. datachain/query/session.py +1 -1
  41. datachain/remote/studio.py +61 -26
  42. datachain/studio.py +36 -10
  43. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/METADATA +2 -2
  44. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/RECORD +48 -44
  45. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/WHEEL +0 -0
  46. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/entry_points.txt +0 -0
  47. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/licenses/LICENSE +0 -0
  48. {datachain-0.21.0.dist-info → datachain-0.22.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py CHANGED
@@ -3,7 +3,7 @@ import os
 import sqlite3
 from collections.abc import Iterable, Sequence
 from contextlib import contextmanager
-from functools import wraps
+from functools import cached_property, wraps
 from time import sleep
 from typing import (
     TYPE_CHECKING,
@@ -15,7 +15,15 @@ from typing import (
 )
 
 import sqlalchemy
-from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
+from sqlalchemy import (
+    Column,
+    Integer,
+    MetaData,
+    Table,
+    UniqueConstraint,
+    exists,
+    select,
+)
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
@@ -30,7 +38,9 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord, StorageURI
-from datachain.error import DataChainError
+from datachain.error import DataChainError, OutdatedDatabaseSchemaError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
@@ -60,6 +70,14 @@ datachain.sql.sqlite.setup()
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote
 
+# NOTE! This should be manually increased when we change our DB schema in codebase
+SCHEMA_VERSION = 1
+
+OUTDATED_SCHEMA_ERROR_MESSAGE = (
+    "You have an old version of the database schema. Please refer to the documentation"
+    " for more information."
+)
+
 
 def _get_in_memory_uri():
     return "file::memory:?cache=shared"
@@ -303,6 +321,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         )
         return bool(next(self.execute(query))[0])
 
+    @property
+    def table_names(self) -> list[str]:
+        query = "SELECT name FROM sqlite_master WHERE type='table';"
+        return [r[0] for r in self.execute_str(query).fetchall()]
+
     def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
         self.execute(CreateTable(table, if_not_exists=if_not_exists))
 
@@ -321,6 +344,8 @@ class SQLiteMetastore(AbstractDBMetastore):
     This is currently used for the local cli.
     """
 
+    META_TABLE = "meta"
+
     db: "SQLiteDatabaseEngine"
 
     def __init__(
@@ -342,7 +367,11 @@ class SQLiteMetastore(AbstractDBMetastore):
 
         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
 
+        self._init_meta_table()
+        self._init_meta_schema_value()
+        self._check_schema_version()
         self._init_tables()
+        self._init_namespaces_projects()
 
     def __exit__(self, exc_type, exc_value, traceback) -> None:
         """Close connection upon exit from context manager."""
@@ -383,8 +412,44 @@ class SQLiteMetastore(AbstractDBMetastore):
         (db_class, db_args, db_kwargs) = db_clone_params
         return cls(uri=uri, db=db_class(*db_args, **db_kwargs))
 
+    @cached_property
+    def _meta(self) -> Table:
+        return Table(self.META_TABLE, self.db.metadata, *self._meta_columns())
+
+    def _meta_select(self, *columns) -> "Select":
+        if not columns:
+            return self._meta.select()
+        return select(*columns)
+
+    def _meta_insert(self) -> "Insert":
+        return sqlite.insert(self._meta)
+
+    def _init_meta_table(self) -> None:
+        """Initializes meta table"""
+        # NOTE! needs to be called before _init_tables()
+        table_names = self.db.table_names
+        if table_names and self.META_TABLE not in table_names:
+            # this will happen on first run
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+        self.db.create_table(self._meta, if_not_exists=True)
+        self.default_table_names.append(self._meta.name)
+
+    def _init_meta_schema_value(self) -> None:
+        """Inserts current schema version value if not present in meta table yet"""
+        stmt = (
+            self._meta_insert()
+            .values(id=1, schema_version=SCHEMA_VERSION)
+            .on_conflict_do_nothing(index_elements=["id"])
+        )
+        self.db.execute(stmt)
+
     def _init_tables(self) -> None:
         """Initialize tables."""
+        self.db.create_table(self._namespaces, if_not_exists=True)
+        self.default_table_names.append(self._namespaces.name)
+        self.db.create_table(self._projects, if_not_exists=True)
+        self.default_table_names.append(self._projects.name)
         self.db.create_table(self._datasets, if_not_exists=True)
         self.default_table_names.append(self._datasets.name)
         self.db.create_table(self._datasets_versions, if_not_exists=True)
@@ -394,10 +459,52 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)
 
+    def _init_namespaces_projects(self) -> None:
+        """
+        Creates local namespace and local project connected to it.
+        In local environment user cannot explicitly create other namespaces and
+        projects and all datasets user creates will be stored in those.
+        When pulling dataset from Studio, then other namespaces and projects will
+        be created implicitly though, to keep the same fully qualified name with
+        Studio dataset.
+        """
+        system_namespace = self.create_namespace(Namespace.system(), "System namespace")
+        self.create_project(system_namespace.name, Project.listing(), "Listing project")
+
+    def _check_schema_version(self) -> None:
+        """
+        Checks if current DB schema is up to date with latest DB model and schema
+        version. If not, OutdatedDatabaseSchemaError is raised.
+        """
+        schema_version = next(self.db.execute(self._meta_select()))[1]
+        if schema_version < SCHEMA_VERSION:
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+    #
+    # Dataset dependencies
+    #
+    @classmethod
+    def _meta_columns(cls) -> list["SchemaItem"]:
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("schema_version", Integer, default=SCHEMA_VERSION),
+        ]
+
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
-        return [*super()._datasets_columns(), UniqueConstraint("name")]
+        return [*super()._datasets_columns(), UniqueConstraint("project_id", "name")]
+
+    @classmethod
+    def _namespaces_columns(cls) -> list["SchemaItem"]:
+        """Datasets table columns."""
+        return [*super()._namespaces_columns(), UniqueConstraint("name")]
+
+    def _namespaces_insert(self) -> "Insert":
+        return sqlite.insert(self._namespaces)
+
+    def _projects_insert(self) -> "Insert":
+        return sqlite.insert(self._projects)
 
     def _datasets_insert(self) -> "Insert":
         return sqlite.insert(self._datasets)
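The hunks above give the local metastore a schema-version handshake: a one-row meta table stores schema_version, _init_meta_schema_value seeds it with an idempotent insert, and _check_schema_version refuses to open a database written under an older schema. A minimal sketch of the same pattern in plain sqlite3 (illustrative only; the simplified table and error are assumptions, not the datachain API):

import sqlite3

SCHEMA_VERSION = 1  # bump manually whenever the on-disk schema changes

def check_schema(conn: sqlite3.Connection) -> None:
    conn.execute(
        "CREATE TABLE IF NOT EXISTS meta "
        "(id INTEGER PRIMARY KEY, schema_version INTEGER)"
    )
    # Seed the version row only once; mirrors
    # on_conflict_do_nothing(index_elements=["id"]) in the hunk above.
    conn.execute(
        "INSERT INTO meta (id, schema_version) VALUES (1, ?) "
        "ON CONFLICT(id) DO NOTHING",
        (SCHEMA_VERSION,),
    )
    (version,) = conn.execute(
        "SELECT schema_version FROM meta WHERE id = 1"
    ).fetchone()
    if version < SCHEMA_VERSION:
        # datachain raises OutdatedDatabaseSchemaError here
        raise RuntimeError("outdated database schema")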
@@ -414,6 +521,8 @@ class SQLiteMetastore(AbstractDBMetastore):
 
     def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]:
         return [
+            self._namespaces.c.name,
+            self._projects.c.name,
             self._datasets_dependencies.c.id,
             self._datasets_dependencies.c.dataset_id,
             self._datasets_dependencies.c.dataset_version_id,
@@ -429,6 +538,26 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)
 
+    @property
+    def is_studio(self) -> bool:
+        return False
+
+    #
+    # Namespaces
+    #
+
+    @property
+    def default_namespace_name(self):
+        return Namespace.default()
+
+    #
+    # Projects
+    #
+
+    @property
+    def default_project_name(self):
+        return Project.default()
+
 
 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -534,16 +663,16 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> None:
         dst_empty = False
 
-        if not self.db.has_table(self.dataset_table_name(src.name, src_version)):
+        if not self.db.has_table(self.dataset_table_name(src, src_version)):
             # source table doesn't exist, nothing to do
             return
 
         src_dr = self.dataset_rows(src, src_version).table
 
-        if not self.db.has_table(self.dataset_table_name(dst.name, dst_version)):
+        if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
             # destination table doesn't exist, create it
             self.create_dataset_rows_table(
-                self.dataset_table_name(dst.name, dst_version),
+                self.dataset_table_name(dst, dst_version),
                 columns=src_dr.columns,
             )
             dst_empty = True
datachain/data_storage/warehouse.py CHANGED
@@ -182,7 +182,7 @@ class AbstractWarehouse(ABC, Serializable):
     ):
         version = version or dataset.latest_version
 
-        table_name = self.dataset_table_name(dataset.name, version)
+        table_name = self.dataset_table_name(dataset, version)
         return self.schema.dataset_row_cls(
             table_name,
             self.db,
254
254
  name = parsed.path if parsed.scheme == "file" else parsed.netloc
255
255
  return parsed.scheme, name
256
256
 
257
- def dataset_table_name(self, dataset_name: str, version: str) -> str:
257
+ def dataset_table_name(self, dataset: DatasetRecord, version: str) -> str:
258
+ return self._construct_dataset_table_name(
259
+ dataset.project.namespace.name,
260
+ dataset.project.name,
261
+ dataset.name,
262
+ version,
263
+ )
264
+
265
+ def _construct_dataset_table_name(
266
+ self, namespace: str, project: str, dataset_name: str, version: str
267
+ ) -> str:
258
268
  prefix = self.DATASET_TABLE_PREFIX
259
269
  if Client.is_data_source_uri(dataset_name):
260
270
  # for datasets that are created for bucket listing we use different prefix
261
271
  prefix = self.DATASET_SOURCE_TABLE_PREFIX
262
- return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
272
+ return (
273
+ f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"
274
+ )
263
275
 
264
276
  def temp_table_name(self) -> str:
265
277
  return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
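With namespaces and projects folded into the physical table name, two projects can each own a dataset with the same name without their row tables colliding. A standalone sketch of the construction above; the "ds_" prefix is an assumption for illustration (the real value comes from DATASET_TABLE_PREFIX):

def construct_dataset_table_name(
    namespace: str, project: str, dataset_name: str, version: str
) -> str:
    prefix = "ds_"  # assumed prefix; bucket-listing datasets use a different one
    return f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"

print(construct_dataset_table_name("dev", "clothes", "zalando", "3.0.1"))
# ds_dev_clothes_zalando_3_0_1

This is also why rename_dataset_table below now takes the DatasetRecord: both the old and new table names must embed the namespace and project.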
@@ -287,7 +299,7 @@ class AbstractWarehouse(ABC, Serializable):
         if_exists: bool = True,
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
-        table_name = self.dataset_table_name(dataset.name, version)
+        table_name = self.dataset_table_name(dataset, version)
         table = sa.Table(table_name, self.db.metadata)
         self.db.drop_table(table, if_exists=if_exists)
@@ -344,13 +356,20 @@ class AbstractWarehouse(ABC, Serializable):
 
     def rename_dataset_table(
         self,
+        dataset: DatasetRecord,
         old_name: str,
         new_name: str,
         old_version: str,
         new_version: str,
     ) -> None:
-        old_ds_table_name = self.dataset_table_name(old_name, old_version)
-        new_ds_table_name = self.dataset_table_name(new_name, new_version)
+        namespace = dataset.project.namespace.name
+        project = dataset.project.name
+        old_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, old_name, old_version
+        )
+        new_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, new_name, new_version
+        )
 
         self.db.rename_table(old_ds_table_name, new_ds_table_name)
 
@@ -368,7 +387,7 @@ class AbstractWarehouse(ABC, Serializable):
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
-        if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
+        if not (self.db.has_table(self.dataset_table_name(dataset, version))):
             return None, None
 
         file_signals = list(
datachain/dataset.py CHANGED
@@ -13,7 +13,9 @@ from typing import (
 from urllib.parse import urlparse
 
 from datachain import semver
-from datachain.error import DatasetVersionNotFoundError
+from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
 T = TypeVar("T", bound="DatasetRecord")
@@ -27,6 +29,8 @@ QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"
 
 DEFAULT_DATASET_VERSION = "1.0.0"
+DATASET_NAME_RESERVED_CHARS = ["."]
+DATASET_NAME_REPLACEMENT_CHAR = "_"
 
 
 # StorageURI represents a normalised URI to a valid storage location (full bucket or
@@ -57,20 +61,35 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
     return name, s[1]
 
 
-def create_dataset_uri(name: str, version: Optional[str] = None) -> str:
+def create_dataset_uri(
+    name: str, namespace: str, project: str, version: Optional[str] = None
+) -> str:
     """
-    Creates a dataset uri based on dataset name and optionally version
+    Creates a dataset uri based on namespace, project, dataset name and optionally
+    version.
     Example:
-        Input: zalando, 3.0.1
-        Output: ds://zalando@v3.0.1
+        Input: dev, clothes, zalando, 3.0.1
+        Output: ds://dev.clothes.zalando@v3.0.1
     """
-    uri = f"{DATASET_PREFIX}{name}"
+    uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
     if version:
         uri += f"@v{version}"
 
     return uri
 
 
+def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
+    """Parses dataset name and returns namespace, project and name"""
+    if not name:
+        raise ValueError("Name must be defined to parse it")
+    split = name.split(".")
+    name = split[-1]
+    project_name = split[-2] if len(split) > 1 else None
+    namespace_name = split[-3] if len(split) > 2 else None
+
+    return namespace_name, project_name, name
+
+
 class DatasetDependencyType:
     DATASET = "dataset"
     STORAGE = "storage"
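parse_dataset_name splits from the right, so the last component is always the dataset name while namespace and project are optional; components before the last three are silently ignored, which is one reason "." becomes a reserved character in bare dataset names (see DatasetRecord.validate_name below). Expected behavior, read straight off the implementation:

parse_dataset_name("dev.clothes.zalando")  # ("dev", "clothes", "zalando")
parse_dataset_name("clothes.zalando")      # (None, "clothes", "zalando")
parse_dataset_name("zalando")              # (None, None, "zalando")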
@@ -78,8 +97,12 @@ class DatasetDependencyType:
 
 @dataclass
 class DatasetDependency:
+    # TODO put `DatasetRecord` instead of name + version which will
+    # simplify codebase in various places
     id: int
     type: str
+    namespace: str
+    project: str
     name: str
     version: str
     created_at: datetime
@@ -100,6 +123,8 @@ class DatasetDependency:
     @classmethod
     def parse(
         cls: builtins.type[DD],
+        namespace_name: str,
+        project_name: str,
         id: int,
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
@@ -121,6 +146,8 @@ class DatasetDependency:
                 if is_listing_dataset(dataset_name)
                 else DatasetDependencyType.DATASET
             ),
+            namespace_name,
+            project_name,
             dataset_name,
             (
                 dataset_version  # type: ignore[arg-type]
@@ -335,6 +362,7 @@ class DatasetListVersion:
 class DatasetRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     schema: dict[str, Union[SQLType, type[SQLType]]]
@@ -349,6 +377,9 @@ class DatasetRecord:
     sources: str = ""
     query_script: str = ""
 
+    def __hash__(self):
+        return hash(f"{self.id}")
+
     @staticmethod
     def parse_schema(
         ct: dict[str, Any],
@@ -358,10 +389,31 @@ class DatasetRecord:
             for c_name, c_type in ct.items()
         }
 
+    @staticmethod
+    def validate_name(name: str) -> None:
+        """Throws exception if name has reserved characters"""
+        for c in DATASET_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidDatasetNameError(
+                    f"Character {c} is reserved and not allowed in dataset name"
+                )
+
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        id: int,
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
+        dataset_project_id: int,
        name: str,
        description: Optional[str],
        attrs: str,
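validate_name guards the single-component form: now that "." separates namespace, project, and dataset, a bare dataset name may not contain it. For example:

from datachain.dataset import DatasetRecord

DatasetRecord.validate_name("zalando")      # passes silently
DatasetRecord.validate_name("dev.zalando")  # raises InvalidDatasetNameError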
@@ -400,6 +452,23 @@ class DatasetRecord:
             json.loads(version_schema) if version_schema else {}
         )
 
+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetVersion.parse(
             version_id,
             version_uuid,
@@ -422,8 +491,9 @@ class DatasetRecord:
         )
 
         return cls(
-            id,
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
@@ -448,6 +518,10 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }
 
+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
         return self.get_version(version).schema if version else self.schema
@@ -527,7 +601,10 @@ class DatasetRecord:
         Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
-        return f"{DATASET_PREFIX}{identifier}"
+        return (
+            f"{DATASET_PREFIX}{self.project.namespace.name}"
+            f".{self.project.name}.{identifier}"
+        )
 
     @property
     def next_version_major(self) -> str:
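With full_name and the rewritten uri, identifiers now carry the whole namespace/project hierarchy; the unchanged docstring example above (ds://dogs@v3.0.1) predates this change. For a dataset zalando in project clothes under namespace dev (a hypothetical record for illustration):

dataset.full_name     # "dev.clothes.zalando"
dataset.uri("3.0.1")  # "ds://dev.clothes.zalando@v3.0.1"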
@@ -592,15 +669,17 @@ class DatasetRecord:
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-        return cls(**kwargs, versions=versions)
+        return cls(**kwargs, versions=versions, project=project)
 
 
 @dataclass
 class DatasetListRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     versions: list[DatasetListVersion]
@@ -609,7 +688,18 @@ class DatasetListRecord:
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        id: int,
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -630,6 +720,23 @@ class DatasetListRecord:
     ) -> "DatasetListRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []
 
+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetListVersion.parse(
             version_id,
             version_uuid,
@@ -647,14 +754,19 @@ class DatasetListRecord:
         )
 
         return cls(
-            id,
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             [dataset_version],
             created_at,
         )
 
+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         """Merge versions from another dataset"""
         if other.id != self.id:
@@ -691,9 +803,11 @@ class DatasetListRecord:
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
         kwargs["versions"] = versions
+        kwargs["project"] = project
         return cls(**kwargs)
datachain/delta.py CHANGED
@@ -56,11 +56,13 @@ def _get_delta_chain(
     compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
-    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
-    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+    source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )
 
     # Calculate diff between source versions
-    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+    return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)
 
 
 def _get_retry_chain(
@@ -79,8 +81,10 @@
     retry_chain = None
 
     # Read the latest version of the result dataset for retry logic
-    result_dataset = datachain.read_dataset(name, latest_version)
-    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+    result_dataset = datachain.read_dataset(name, version=latest_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )
 
     # Handle error records if delta_retry is a string (column name)
     if isinstance(delta_retry, str):
@@ -232,8 +236,8 @@
     if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False
 
-    latest_dataset = datachain.read_dataset(name, latest_version)
-    compared_chain = latest_dataset.compare(
+    latest_dataset = datachain.read_dataset(name, version=latest_version)
+    compared_chain = latest_dataset.diff(
         processing_chain,
         on=right_on or on,
         added=True,
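Two API shifts run through this file: read_dataset now takes the version as a keyword argument, and version comparison is spelled diff() instead of compare(). A usage sketch (the dataset name and join column are placeholders):

import datachain

latest = datachain.read_dataset("dev.clothes.zalando", version="2.0.0")
previous = datachain.read_dataset("dev.clothes.zalando", version="1.0.0")

# Rows added or changed in 2.0.0 relative to 1.0.0, mirroring _get_delta_chain.
changed = latest.diff(previous, on="id", deleted=False)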
datachain/error.py CHANGED
@@ -2,10 +2,42 @@ class DataChainError(RuntimeError):
     pass
 
 
+class InvalidDatasetNameError(RuntimeError):
+    pass
+
+
+class InvalidNamespaceNameError(RuntimeError):
+    pass
+
+
+class InvalidProjectNameError(RuntimeError):
+    pass
+
+
 class NotFoundError(Exception):
     pass
 
 
+class NamespaceNotFoundError(NotFoundError):
+    pass
+
+
+class NotAllowedError(Exception):
+    pass
+
+
+class NamespaceCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetNotFoundError(NotFoundError):
     pass
 
@@ -53,3 +85,7 @@ class ClientError(RuntimeError):
 
 class TableMissingError(DataChainError):
     pass
+
+
+class OutdatedDatabaseSchemaError(DataChainError):
+    pass
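The new exceptions extend the existing hierarchy: the *NotFoundError classes subclass NotFoundError, the *CreateNotAllowedError classes subclass the new NotAllowedError, and OutdatedDatabaseSchemaError subclasses DataChainError, so one handler can cover a whole family. A small sketch:

from datachain.error import NotFoundError, OutdatedDatabaseSchemaError

def load_or_none(load):  # load: any callable that touches the metastore
    try:
        return load()
    except OutdatedDatabaseSchemaError:
        # local DB predates SCHEMA_VERSION; see the sqlite.py hunks above
        raise
    except NotFoundError:
        # covers NamespaceNotFoundError, ProjectNotFoundError,
        # DatasetNotFoundError, DatasetVersionNotFoundError, ...
        return None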