datachain 0.21.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (49)
  1. datachain/__init__.py +2 -0
  2. datachain/cache.py +2 -2
  3. datachain/catalog/catalog.py +213 -65
  4. datachain/cli/__init__.py +0 -7
  5. datachain/cli/commands/datasets.py +35 -26
  6. datachain/cli/commands/ls.py +2 -2
  7. datachain/cli/parser/__init__.py +1 -35
  8. datachain/client/fsspec.py +5 -3
  9. datachain/client/hf.py +10 -0
  10. datachain/client/local.py +4 -4
  11. datachain/data_storage/metastore.py +433 -37
  12. datachain/data_storage/sqlite.py +140 -7
  13. datachain/data_storage/warehouse.py +26 -7
  14. datachain/dataset.py +128 -12
  15. datachain/delta.py +11 -7
  16. datachain/error.py +36 -0
  17. datachain/func/func.py +1 -1
  18. datachain/lib/arrow.py +3 -3
  19. datachain/lib/dataset_info.py +4 -0
  20. datachain/lib/dc/datachain.py +253 -91
  21. datachain/lib/dc/datasets.py +103 -50
  22. datachain/lib/dc/listings.py +3 -3
  23. datachain/lib/dc/records.py +2 -1
  24. datachain/lib/dc/storage.py +38 -40
  25. datachain/lib/file.py +77 -23
  26. datachain/lib/listing.py +3 -1
  27. datachain/lib/meta_formats.py +1 -1
  28. datachain/lib/namespaces.py +71 -0
  29. datachain/lib/projects.py +86 -0
  30. datachain/lib/pytorch.py +1 -1
  31. datachain/lib/settings.py +10 -0
  32. datachain/lib/signal_schema.py +8 -0
  33. datachain/lib/tar.py +1 -2
  34. datachain/lib/udf.py +1 -1
  35. datachain/lib/udf_signature.py +1 -1
  36. datachain/lib/webdataset.py +30 -20
  37. datachain/listing.py +3 -1
  38. datachain/namespace.py +65 -0
  39. datachain/project.py +78 -0
  40. datachain/query/dataset.py +71 -46
  41. datachain/query/session.py +1 -1
  42. datachain/remote/studio.py +61 -26
  43. datachain/studio.py +23 -6
  44. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/METADATA +2 -2
  45. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/RECORD +49 -45
  46. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/WHEEL +0 -0
  47. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/entry_points.txt +0 -0
  48. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/licenses/LICENSE +0 -0
  49. {datachain-0.21.1.dist-info → datachain-0.23.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py CHANGED
@@ -3,7 +3,7 @@ import os
 import sqlite3
 from collections.abc import Iterable, Sequence
 from contextlib import contextmanager
-from functools import wraps
+from functools import cached_property, wraps
 from time import sleep
 from typing import (
     TYPE_CHECKING,
@@ -15,7 +15,15 @@ from typing import (
 )

 import sqlalchemy
-from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
+from sqlalchemy import (
+    Column,
+    Integer,
+    MetaData,
+    Table,
+    UniqueConstraint,
+    exists,
+    select,
+)
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
@@ -30,7 +38,9 @@ from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord, StorageURI
-from datachain.error import DataChainError
+from datachain.error import DataChainError, OutdatedDatabaseSchemaError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
 from datachain.sql.sqlite.base import load_usearch_extension
 from datachain.sql.types import SQLType
@@ -60,6 +70,14 @@ datachain.sql.sqlite.setup()
 quote_schema = sqlite_dialect.identifier_preparer.quote_schema
 quote = sqlite_dialect.identifier_preparer.quote

+# NOTE! This should be manually increased when we change our DB schema in codebase
+SCHEMA_VERSION = 1
+
+OUTDATED_SCHEMA_ERROR_MESSAGE = (
+    "You have an old version of the database schema. Please refer to the documentation"
+    " for more information."
+)
+

 def _get_in_memory_uri():
     return "file::memory:?cache=shared"
@@ -303,6 +321,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         )
         return bool(next(self.execute(query))[0])

+    @property
+    def table_names(self) -> list[str]:
+        query = "SELECT name FROM sqlite_master WHERE type='table';"
+        return [r[0] for r in self.execute_str(query).fetchall()]
+
     def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
         self.execute(CreateTable(table, if_not_exists=if_not_exists))

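The new `table_names` property lists existing tables through SQLite's built-in `sqlite_master` catalog. The same query works on any SQLite connection:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE meta (id INTEGER PRIMARY KEY, schema_version INTEGER)")

# The same catalog query the property issues through the engine:
names = [r[0] for r in conn.execute("SELECT name FROM sqlite_master WHERE type='table';")]
print(names)  # ['meta']
```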
@@ -321,6 +344,8 @@ class SQLiteMetastore(AbstractDBMetastore):
     This is currently used for the local cli.
     """

+    META_TABLE = "meta"
+
     db: "SQLiteDatabaseEngine"

     def __init__(
@@ -342,7 +367,11 @@ class SQLiteMetastore(AbstractDBMetastore):

         self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)

+        self._init_meta_table()
+        self._init_meta_schema_value()
+        self._check_schema_version()
         self._init_tables()
+        self._init_namespaces_projects()

     def __exit__(self, exc_type, exc_value, traceback) -> None:
         """Close connection upon exit from context manager."""
@@ -383,8 +412,44 @@ class SQLiteMetastore(AbstractDBMetastore):
         (db_class, db_args, db_kwargs) = db_clone_params
         return cls(uri=uri, db=db_class(*db_args, **db_kwargs))

+    @cached_property
+    def _meta(self) -> Table:
+        return Table(self.META_TABLE, self.db.metadata, *self._meta_columns())
+
+    def _meta_select(self, *columns) -> "Select":
+        if not columns:
+            return self._meta.select()
+        return select(*columns)
+
+    def _meta_insert(self) -> "Insert":
+        return sqlite.insert(self._meta)
+
+    def _init_meta_table(self) -> None:
+        """Initializes meta table"""
+        # NOTE! needs to be called before _init_tables()
+        table_names = self.db.table_names
+        if table_names and self.META_TABLE not in table_names:
+            # this will happen on first run
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+        self.db.create_table(self._meta, if_not_exists=True)
+        self.default_table_names.append(self._meta.name)
+
+    def _init_meta_schema_value(self) -> None:
+        """Inserts current schema version value if not present in meta table yet"""
+        stmt = (
+            self._meta_insert()
+            .values(id=1, schema_version=SCHEMA_VERSION)
+            .on_conflict_do_nothing(index_elements=["id"])
+        )
+        self.db.execute(stmt)
+
     def _init_tables(self) -> None:
         """Initialize tables."""
+        self.db.create_table(self._namespaces, if_not_exists=True)
+        self.default_table_names.append(self._namespaces.name)
+        self.db.create_table(self._projects, if_not_exists=True)
+        self.default_table_names.append(self._projects.name)
         self.db.create_table(self._datasets, if_not_exists=True)
         self.default_table_names.append(self._datasets.name)
         self.db.create_table(self._datasets_versions, if_not_exists=True)
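`_init_meta_schema_value` leans on SQLite's upsert support: `ON CONFLICT DO NOTHING` makes the insert idempotent, so the version written on first run survives every later startup. A self-contained sketch of the same statement in plain SQLAlchemy (table and column names mirror the diff):

```python
import sqlalchemy as sa
from sqlalchemy.dialects.sqlite import insert

engine = sa.create_engine("sqlite://")
metadata = sa.MetaData()
meta = sa.Table(
    "meta",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("schema_version", sa.Integer),
)
metadata.create_all(engine)

stmt = (
    insert(meta)
    .values(id=1, schema_version=1)
    .on_conflict_do_nothing(index_elements=["id"])
)
with engine.begin() as conn:
    conn.execute(stmt)  # first run: inserts the row
    conn.execute(stmt)  # any later run: no-op, the stored version is preserved
```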
@@ -394,10 +459,56 @@ class SQLiteMetastore(AbstractDBMetastore):
         self.db.create_table(self._jobs, if_not_exists=True)
         self.default_table_names.append(self._jobs.name)

+    def _init_namespaces_projects(self) -> None:
+        """
+        Creates local namespace and local project connected to it.
+        In local environment user cannot explicitly create other namespaces and
+        projects and all datasets user creates will be stored in those.
+        When pulling dataset from Studio, then other namespaces and projects will
+        be created implicitly though, to keep the same fully qualified name with
+        Studio dataset.
+        """
+        system_namespace = self.create_namespace(
+            Namespace.system(), "System namespace", validate=False
+        )
+        self.create_project(
+            system_namespace.name, Project.listing(), "Listing project", validate=False
+        )
+
+    def _check_schema_version(self) -> None:
+        """
+        Checks if current DB schema is up to date with latest DB model and schema
+        version. If not, OutdatedDatabaseSchemaError is raised.
+        """
+        schema_version = next(self.db.execute(self._meta_select()))[1]
+        if schema_version < SCHEMA_VERSION:
+            raise OutdatedDatabaseSchemaError(OUTDATED_SCHEMA_ERROR_MESSAGE)
+
+    #
+    # Dataset dependencies
+    #
+    @classmethod
+    def _meta_columns(cls) -> list["SchemaItem"]:
+        return [
+            Column("id", Integer, primary_key=True),
+            Column("schema_version", Integer, default=SCHEMA_VERSION),
+        ]
+
     @classmethod
     def _datasets_columns(cls) -> list["SchemaItem"]:
         """Datasets table columns."""
-        return [*super()._datasets_columns(), UniqueConstraint("name")]
+        return [*super()._datasets_columns(), UniqueConstraint("project_id", "name")]
+
+    @classmethod
+    def _namespaces_columns(cls) -> list["SchemaItem"]:
+        """Datasets table columns."""
+        return [*super()._namespaces_columns(), UniqueConstraint("name")]
+
+    def _namespaces_insert(self) -> "Insert":
+        return sqlite.insert(self._namespaces)
+
+    def _projects_insert(self) -> "Insert":
+        return sqlite.insert(self._projects)

     def _datasets_insert(self) -> "Insert":
         return sqlite.insert(self._datasets)
@@ -414,6 +525,8 @@ class SQLiteMetastore(AbstractDBMetastore):

     def _dataset_dependencies_select_columns(self) -> list["SchemaItem"]:
         return [
+            self._namespaces.c.name,
+            self._projects.c.name,
             self._datasets_dependencies.c.id,
             self._datasets_dependencies.c.dataset_id,
             self._datasets_dependencies.c.dataset_version_id,
@@ -429,6 +542,26 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _jobs_insert(self) -> "Insert":
         return sqlite.insert(self._jobs)

+    @property
+    def is_studio(self) -> bool:
+        return False
+
+    #
+    # Namespaces
+    #
+
+    @property
+    def default_namespace_name(self):
+        return Namespace.default()
+
+    #
+    # Projects
+    #
+
+    @property
+    def default_project_name(self):
+        return Project.default()
+

 class SQLiteWarehouse(AbstractWarehouse):
     """
@@ -534,16 +667,16 @@ class SQLiteWarehouse(AbstractWarehouse):
     ) -> None:
         dst_empty = False

-        if not self.db.has_table(self.dataset_table_name(src.name, src_version)):
+        if not self.db.has_table(self.dataset_table_name(src, src_version)):
             # source table doesn't exist, nothing to do
             return

         src_dr = self.dataset_rows(src, src_version).table

-        if not self.db.has_table(self.dataset_table_name(dst.name, dst_version)):
+        if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
             # destination table doesn't exist, create it
             self.create_dataset_rows_table(
-                self.dataset_table_name(dst.name, dst_version),
+                self.dataset_table_name(dst, dst_version),
                 columns=src_dr.columns,
             )
             dst_empty = True
datachain/data_storage/warehouse.py CHANGED
@@ -182,7 +182,7 @@ class AbstractWarehouse(ABC, Serializable):
     ):
         version = version or dataset.latest_version

-        table_name = self.dataset_table_name(dataset.name, version)
+        table_name = self.dataset_table_name(dataset, version)
         return self.schema.dataset_row_cls(
             table_name,
             self.db,
@@ -254,12 +254,24 @@ class AbstractWarehouse(ABC, Serializable):
         name = parsed.path if parsed.scheme == "file" else parsed.netloc
         return parsed.scheme, name

-    def dataset_table_name(self, dataset_name: str, version: str) -> str:
+    def dataset_table_name(self, dataset: DatasetRecord, version: str) -> str:
+        return self._construct_dataset_table_name(
+            dataset.project.namespace.name,
+            dataset.project.name,
+            dataset.name,
+            version,
+        )
+
+    def _construct_dataset_table_name(
+        self, namespace: str, project: str, dataset_name: str, version: str
+    ) -> str:
         prefix = self.DATASET_TABLE_PREFIX
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
             prefix = self.DATASET_SOURCE_TABLE_PREFIX
-        return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
+        return (
+            f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"
+        )

     def temp_table_name(self) -> str:
         return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
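Warehouse row tables are now named per namespace and project as well, so equally named datasets in different projects no longer collide. A hypothetical standalone rendering of the scheme (assuming `DATASET_TABLE_PREFIX` is `ds_`; the real method switches to a different prefix for bucket-listing datasets):

```python
def construct_dataset_table_name(
    namespace: str, project: str, dataset_name: str, version: str
) -> str:
    # Mirrors _construct_dataset_table_name for the non-listing case.
    prefix = "ds_"  # assumed value of DATASET_TABLE_PREFIX
    return f"{prefix}{namespace}_{project}_{dataset_name}_{version.replace('.', '_')}"


print(construct_dataset_table_name("dev", "clothes", "zalando", "3.0.1"))
# ds_dev_clothes_zalando_3_0_1
```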
@@ -287,7 +299,7 @@ class AbstractWarehouse(ABC, Serializable):
         if_exists: bool = True,
     ) -> None:
         """Drops a dataset rows table for the given dataset name."""
-        table_name = self.dataset_table_name(dataset.name, version)
+        table_name = self.dataset_table_name(dataset, version)
         table = sa.Table(table_name, self.db.metadata)
         self.db.drop_table(table, if_exists=if_exists)

@@ -344,13 +356,20 @@ class AbstractWarehouse(ABC, Serializable):

     def rename_dataset_table(
         self,
+        dataset: DatasetRecord,
         old_name: str,
         new_name: str,
         old_version: str,
         new_version: str,
     ) -> None:
-        old_ds_table_name = self.dataset_table_name(old_name, old_version)
-        new_ds_table_name = self.dataset_table_name(new_name, new_version)
+        namespace = dataset.project.namespace.name
+        project = dataset.project.name
+        old_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, old_name, old_version
+        )
+        new_ds_table_name = self._construct_dataset_table_name(
+            namespace, project, new_name, new_version
+        )

         self.db.rename_table(old_ds_table_name, new_ds_table_name)

@@ -368,7 +387,7 @@
         """
         Returns tuple with dataset stats: total number of rows and total dataset size.
         """
-        if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
+        if not (self.db.has_table(self.dataset_table_name(dataset, version))):
             return None, None

         file_signals = list(
datachain/dataset.py CHANGED
@@ -13,7 +13,9 @@ from typing import (
 from urllib.parse import urlparse

 from datachain import semver
-from datachain.error import DatasetVersionNotFoundError
+from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

 T = TypeVar("T", bound="DatasetRecord")
@@ -27,6 +29,8 @@ QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"

 DEFAULT_DATASET_VERSION = "1.0.0"
+DATASET_NAME_RESERVED_CHARS = ["."]
+DATASET_NAME_REPLACEMENT_CHAR = "_"


 # StorageURI represents a normalised URI to a valid storage location (full bucket or
@@ -57,20 +61,37 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
     return name, s[1]


-def create_dataset_uri(name: str, version: Optional[str] = None) -> str:
+def create_dataset_uri(
+    name: str, namespace: str, project: str, version: Optional[str] = None
+) -> str:
     """
-    Creates a dataset uri based on dataset name and optionally version
+    Creates a dataset uri based on namespace, project, dataset name and optionally
+    version.
     Example:
-        Input: zalando, 3.0.1
-        Output: ds//zalando@v3.0.1
+        Input: dev, clothes, zalando, 3.0.1
+        Output: ds//dev.clothes.zalando@v3.0.1
     """
-    uri = f"{DATASET_PREFIX}{name}"
+    uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
     if version:
         uri += f"@v{version}"

     return uri


+def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
+    """Parses dataset name and returns namespace, project and name"""
+    if not name:
+        raise InvalidDatasetNameError("Name must be defined to parse it")
+    split = name.split(".")
+    if len(split) > 3:
+        raise InvalidDatasetNameError(f"Invalid dataset name {name}")
+    name = split[-1]
+    project_name = split[-2] if len(split) > 1 else None
+    namespace_name = split[-3] if len(split) > 2 else None
+
+    return namespace_name, project_name, name
+
+
 class DatasetDependencyType:
     DATASET = "dataset"
     STORAGE = "storage"
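With these helpers a dataset reference becomes a dotted, fully qualified name whose namespace and project parts are optional. Assuming `DATASET_PREFIX` is `ds://` (as in the `uri` example later in this file), they compose like this:

```python
# Behavior of the new helpers, per their definitions above.
print(parse_dataset_name("dev.clothes.zalando"))  # ('dev', 'clothes', 'zalando')
print(parse_dataset_name("clothes.zalando"))      # (None, 'clothes', 'zalando')
print(parse_dataset_name("zalando"))              # (None, None, 'zalando')

print(create_dataset_uri("zalando", "dev", "clothes", "3.0.1"))
# ds://dev.clothes.zalando@v3.0.1
```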
@@ -78,8 +99,12 @@ class DatasetDependencyType:

 @dataclass
 class DatasetDependency:
+    # TODO put `DatasetRecord` instead of name + version which will
+    # simplify codebase in various places
     id: int
     type: str
+    namespace: str
+    project: str
     name: str
     version: str
     created_at: datetime
@@ -100,6 +125,8 @@ class DatasetDependency:
     @classmethod
     def parse(
         cls: builtins.type[DD],
+        namespace_name: str,
+        project_name: str,
         id: int,
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
@@ -121,6 +148,8 @@ class DatasetDependency:
                 if is_listing_dataset(dataset_name)
                 else DatasetDependencyType.DATASET
             ),
+            namespace_name,
+            project_name,
             dataset_name,
             (
                 dataset_version  # type: ignore[arg-type]
@@ -335,6 +364,7 @@ class DatasetListVersion:
 class DatasetRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     schema: dict[str, Union[SQLType, type[SQLType]]]
@@ -349,6 +379,9 @@ class DatasetRecord:
     sources: str = ""
     query_script: str = ""

+    def __hash__(self):
+        return hash(f"{self.id}")
+
     @staticmethod
     def parse_schema(
         ct: dict[str, Any],
@@ -358,10 +391,31 @@ class DatasetRecord:
             for c_name, c_type in ct.items()
         }

+    @staticmethod
+    def validate_name(name: str) -> None:
+        """Throws exception if name has reserved characters"""
+        for c in DATASET_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidDatasetNameError(
+                    f"Character {c} is reserved and not allowed in dataset name"
+                )
+
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        id: int,
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
+        dataset_project_id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -400,6 +454,23 @@ class DatasetRecord:
             json.loads(version_schema) if version_schema else {}
         )

+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetVersion.parse(
             version_id,
             version_uuid,
@@ -422,8 +493,9 @@ class DatasetRecord:
         )

         return cls(
-            id,
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
@@ -448,6 +520,10 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }

+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
         return self.get_version(version).schema if version else self.schema

@@ -527,7 +603,10 @@ class DatasetRecord:
         Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
-        return f"{DATASET_PREFIX}{identifier}"
+        return (
+            f"{DATASET_PREFIX}{self.project.namespace.name}"
+            f".{self.project.name}.{identifier}"
+        )

     @property
     def next_version_major(self) -> str:
@@ -592,15 +671,17 @@ class DatasetRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-        return cls(**kwargs, versions=versions)
+        return cls(**kwargs, versions=versions, project=project)


 @dataclass
 class DatasetListRecord:
     id: int
     name: str
+    project: Project
     description: Optional[str]
     attrs: list[str]
     versions: list[DatasetListVersion]
@@ -609,7 +690,18 @@ class DatasetListRecord:
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-        id: int,
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: Optional[str],
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: Optional[str],
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
         name: str,
         description: Optional[str],
         attrs: str,
@@ -630,6 +722,23 @@ class DatasetListRecord:
     ) -> "DatasetListRecord":
         attrs_lst: list[str] = json.loads(attrs) if attrs else []

+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
+
         dataset_version = DatasetListVersion.parse(
             version_id,
             version_uuid,
@@ -647,14 +756,19 @@ class DatasetListRecord:
         )

         return cls(
-            id,
+            dataset_id,
             name,
+            project,
             description,
             attrs_lst,
             [dataset_version],
             created_at,
         )

+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         """Merge versions from another dataset"""
         if other.id != self.id:
@@ -691,9 +805,11 @@ class DatasetListRecord:

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
         kwargs["versions"] = versions
+        kwargs["project"] = project
         return cls(**kwargs)

datachain/delta.py CHANGED
@@ -56,11 +56,13 @@ def _get_delta_chain(
     compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
-    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
-    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+    source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )

     # Calculate diff between source versions
-    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+    return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)


 def _get_retry_chain(
@@ -79,8 +81,10 @@ def _get_retry_chain(
     retry_chain = None

     # Read the latest version of the result dataset for retry logic
-    result_dataset = datachain.read_dataset(name, latest_version)
-    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+    result_dataset = datachain.read_dataset(name, version=latest_version)
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name, version=source_ds_latest_version
+    )

     # Handle error records if delta_retry is a string (column name)
     if isinstance(delta_retry, str):
@@ -232,8 +236,8 @@ def delta_retry_update(
     if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False

-    latest_dataset = datachain.read_dataset(name, latest_version)
-    compared_chain = latest_dataset.compare(
+    latest_dataset = datachain.read_dataset(name, version=latest_version)
+    compared_chain = latest_dataset.diff(
         processing_chain,
         on=right_on or on,
         added=True,
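Two call-site changes run through this file: `read_dataset` now receives `version` as a keyword argument, since the name itself may carry namespace and project parts, and `compare` was replaced by `diff` on the chain. A usage sketch with a hypothetical dataset name:

```python
import datachain

# "zalando" is a hypothetical dataset; pass version by keyword, as delta.py now does.
prev = datachain.read_dataset("zalando", version="1.0.0")
latest = datachain.read_dataset("zalando", version="1.0.1")

# diff() replaces compare() at these call sites; the flags pick which rows to keep.
changed = latest.diff(prev, on="id", deleted=False)
```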
datachain/error.py CHANGED
@@ -2,10 +2,42 @@ class DataChainError(RuntimeError):
     pass


+class InvalidDatasetNameError(RuntimeError):
+    pass
+
+
+class InvalidNamespaceNameError(RuntimeError):
+    pass
+
+
+class InvalidProjectNameError(RuntimeError):
+    pass
+
+
 class NotFoundError(Exception):
     pass


+class NamespaceNotFoundError(NotFoundError):
+    pass
+
+
+class NotAllowedError(Exception):
+    pass
+
+
+class NamespaceCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetNotFoundError(NotFoundError):
     pass

@@ -53,3 +85,7 @@ class ClientError(RuntimeError):

 class TableMissingError(DataChainError):
     pass
+
+
+class OutdatedDatabaseSchemaError(DataChainError):
+    pass
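The new exceptions slot into the existing hierarchy, so handlers written against the base classes keep catching them. A quick check of the relationships declared above:

```python
from datachain.error import (
    DataChainError,
    NamespaceNotFoundError,
    NotAllowedError,
    NotFoundError,
    OutdatedDatabaseSchemaError,
    ProjectCreateNotAllowedError,
)

assert issubclass(NamespaceNotFoundError, NotFoundError)
assert issubclass(ProjectCreateNotAllowedError, NotAllowedError)
assert issubclass(OutdatedDatabaseSchemaError, DataChainError)
```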