datachain 0.22.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/catalog/catalog.py CHANGED
@@ -49,6 +49,7 @@ from datachain.error import (
     DatasetInvalidVersionError,
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
+    NamespaceNotFoundError,
     ProjectNotFoundError,
     QueryScriptCancelError,
     QueryScriptRunError,
@@ -1059,6 +1060,39 @@ class Catalog:

         return self.get_dataset(name, project)

+    def get_full_dataset_name(
+        self,
+        name: str,
+        project_name: Optional[str] = None,
+        namespace_name: Optional[str] = None,
+    ) -> tuple[str, str, str]:
+        """
+        Returns dataset name together with separated namespace and project name.
+        It takes into account all the ways namespace and project can be added.
+        """
+        parsed_namespace_name, parsed_project_name, name = parse_dataset_name(name)
+
+        namespace_env = os.environ.get("DATACHAIN_NAMESPACE")
+        project_env = os.environ.get("DATACHAIN_PROJECT")
+        if project_env and len(project_env.split(".")) == 2:
+            # we allow setting both namespace and project in DATACHAIN_PROJECT
+            namespace_env, project_env = project_env.split(".")
+
+        namespace_name = (
+            parsed_namespace_name
+            or namespace_name
+            or namespace_env
+            or self.metastore.default_namespace_name
+        )
+        project_name = (
+            parsed_project_name
+            or project_name
+            or project_env
+            or self.metastore.default_project_name
+        )
+
+        return namespace_name, project_name, name
+
     def get_dataset(
         self, name: str, project: Optional[Project] = None
     ) -> DatasetRecord:
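The resolution order implemented by `get_full_dataset_name` is: a fully qualified name first, then the explicit `project_name`/`namespace_name` arguments, then the `DATACHAIN_NAMESPACE`/`DATACHAIN_PROJECT` environment variables (where `DATACHAIN_PROJECT` may carry a combined `namespace.project` value), and finally the metastore defaults. A minimal standalone sketch of that precedence; the fallback names "local" and "default" here are illustrative stand-ins, not values taken from this diff:

```py
import os
from typing import Optional

def resolve(name: str, project: Optional[str] = None,
            namespace: Optional[str] = None) -> tuple[str, str, str]:
    # Split an optionally qualified "namespace.project.name".
    parts = name.split(".")
    parsed_ns = parts[-3] if len(parts) > 2 else None
    parsed_proj = parts[-2] if len(parts) > 1 else None

    ns_env = os.environ.get("DATACHAIN_NAMESPACE")
    proj_env = os.environ.get("DATACHAIN_PROJECT")
    if proj_env and len(proj_env.split(".")) == 2:
        # DATACHAIN_PROJECT may carry both parts: "namespace.project"
        ns_env, proj_env = proj_env.split(".")

    # Qualified name wins, then explicit args, then env vars, then defaults.
    return (
        parsed_ns or namespace or ns_env or "local",
        parsed_proj or project or proj_env or "default",
        parts[-1],
    )

os.environ["DATACHAIN_PROJECT"] = "dev.chatbot"
assert resolve("cats") == ("dev", "chatbot", "cats")
assert resolve("prod.analytics.cats") == ("prod", "analytics", "cats")
```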
@@ -1074,21 +1108,26 @@ class Catalog:
         namespace_name: str,
         project_name: str,
         version: Optional[str] = None,
+        pull_dataset: bool = False,
+        update: bool = False,
     ) -> DatasetRecord:
-        try:
-            project = self.metastore.get_project(project_name, namespace_name)
-            ds = self.get_dataset(name, project)
-            if version and not ds.has_version(version):
-                raise DatasetVersionNotFoundError(
-                    f"Dataset {name} does not have version {version}"
-                )
-            return ds
+        if self.metastore.is_local_dataset(namespace_name) or not update:
+            try:
+                project = self.metastore.get_project(project_name, namespace_name)
+                ds = self.get_dataset(name, project)
+                if not version or ds.has_version(version):
+                    return ds
+            except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
+                pass
+
+        if self.metastore.is_local_dataset(namespace_name):
+            raise DatasetNotFoundError(
+                f"Dataset {name}"
+                + (f" version {version} " if version else " ")
+                + "not found"
+            )

-        except (
-            ProjectNotFoundError,
-            DatasetNotFoundError,
-            DatasetVersionNotFoundError,
-        ):
+        if pull_dataset:
             print("Dataset not found in local catalog, trying to get from studio")
             remote_ds_uri = create_dataset_uri(
                 name, namespace_name, project_name, version
@@ -1103,6 +1142,8 @@ class Catalog:
             name, self.metastore.get_project(project_name, namespace_name)
         )

+        return self.get_remote_dataset(namespace_name, project_name, name)
+
     def get_dataset_with_version_uuid(self, uuid: str) -> DatasetRecord:
         """Returns dataset that contains version with specific uuid"""
         for dataset in self.ls_datasets():
@@ -1119,6 +1160,10 @@ class Catalog:

         info_response = studio_client.dataset_info(namespace, project, name)
         if not info_response.ok:
+            if info_response.status == 404:
+                raise DatasetNotFoundError(
+                    f"Dataset {namespace}.{project}.{name} not found"
+                )
             raise DataChainError(info_response.message)

         dataset_info = info_response.data
datachain/cli/commands/datasets.py CHANGED
@@ -8,7 +8,6 @@ if TYPE_CHECKING:

 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
-from datachain.dataset import parse_dataset_name
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio

@@ -106,9 +105,8 @@ def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):


 def list_datasets_local_versions(catalog: "Catalog", name: str):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)
+
     project = catalog.metastore.get_project(project_name, namespace_name)
     ds = catalog.get_dataset(name, project)
     for v in ds.versions:
@@ -137,9 +135,7 @@ def rm_dataset(
     studio: Optional[bool] = False,
     team: Optional[str] = None,
 ):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if not catalog.metastore.is_local_dataset(namespace_name) and studio:
         from datachain.studio import remove_studio_dataset
@@ -166,9 +162,7 @@ def edit_dataset(
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
 ):
-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = namespace_name or catalog.metastore.default_namespace_name
-    project_name = project_name or catalog.metastore.default_project_name
+    namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if catalog.metastore.is_local_dataset(namespace_name):
         try:
datachain/data_storage/metastore.py CHANGED
@@ -132,6 +132,7 @@ class AbstractMetastore(ABC, Serializable):
         description: Optional[str] = None,
         uuid: Optional[str] = None,
         ignore_if_exists: bool = True,
+        validate: bool = True,
         **kwargs,
     ) -> Namespace:
         """Creates new namespace"""
@@ -192,6 +193,7 @@ class AbstractMetastore(ABC, Serializable):
         description: Optional[str] = None,
         uuid: Optional[str] = None,
         ignore_if_exists: bool = True,
+        validate: bool = True,
         **kwargs,
     ) -> Project:
         """Creates new project in specific namespace"""
@@ -725,8 +727,11 @@ class AbstractDBMetastore(AbstractMetastore):
         description: Optional[str] = None,
         uuid: Optional[str] = None,
         ignore_if_exists: bool = True,
+        validate: bool = True,
         **kwargs,
     ) -> Namespace:
+        if validate:
+            Namespace.validate_name(name)
         query = self._namespaces_insert().values(
             name=name,
             uuid=uuid or str(uuid4()),
@@ -775,12 +780,15 @@ class AbstractDBMetastore(AbstractMetastore):
         description: Optional[str] = None,
         uuid: Optional[str] = None,
         ignore_if_exists: bool = True,
+        validate: bool = True,
         **kwargs,
     ) -> Project:
+        if validate:
+            Project.validate_name(name)
         try:
             namespace = self.get_namespace(namespace_name)
         except NamespaceNotFoundError:
-            namespace = self.create_namespace(namespace_name)
+            namespace = self.create_namespace(namespace_name, validate=validate)

         query = self._projects_insert().values(
             namespace_id=namespace.id,
@@ -817,11 +825,14 @@ class AbstractDBMetastore(AbstractMetastore):
         """Gets a single project inside some namespace by name"""
         n = self._namespaces
         p = self._projects
+        validate = True
+
         if self._is_listing_project(name, namespace_name) or self._is_default_project(
             name, namespace_name
         ):
             # we are always creating default and listing projects if they don't exist
             create = True
+            validate = False

         query = self._projects_select(
             *(getattr(n.c, f) for f in self._namespaces_fields),
@@ -834,7 +845,7 @@ class AbstractDBMetastore(AbstractMetastore):
         rows = list(self.db.execute(query, conn=conn))
         if not rows:
             if create:
-                return self.create_project(namespace_name, name)
+                return self.create_project(namespace_name, name, validate=validate)
             raise ProjectNotFoundError(
                 f"Project {name} in namespace {namespace_name} not found."
             )
datachain/data_storage/sqlite.py CHANGED
@@ -468,8 +468,12 @@ class SQLiteMetastore(AbstractDBMetastore):
        be created implicitly though, to keep the same fully qualified name with
        Studio dataset.
        """
-        system_namespace = self.create_namespace(Namespace.system(), "System namespace")
-        self.create_project(system_namespace.name, Project.listing(), "Listing project")
+        system_namespace = self.create_namespace(
+            Namespace.system(), "System namespace", validate=False
+        )
+        self.create_project(
+            system_namespace.name, Project.listing(), "Listing project", validate=False
+        )

     def _check_schema_version(self) -> None:
         """
datachain/dataset.py CHANGED
@@ -12,6 +12,9 @@ from typing import (
 )
 from urllib.parse import urlparse

+from packaging.specifiers import SpecifierSet
+from packaging.version import Version
+
 from datachain import semver
 from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
 from datachain.namespace import Namespace
@@ -81,8 +84,10 @@ def create_dataset_uri(
 def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
     """Parses dataset name and returns namespace, project and name"""
     if not name:
-        raise ValueError("Name must be defined to parse it")
+        raise InvalidDatasetNameError("Name must be defined to parse it")
     split = name.split(".")
+    if len(split) > 3:
+        raise InvalidDatasetNameError(f"Invalid dataset name {name}")
     name = split[-1]
     project_name = split[-2] if len(split) > 1 else None
     namespace_name = split[-3] if len(split) > 2 else None
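`parse_dataset_name` now rejects names with more than three dot-separated parts instead of silently dropping the extras. A standalone sketch of the same split rules, with `ValueError` standing in for `InvalidDatasetNameError`:

```py
from typing import Optional

def parse(name: str) -> tuple[Optional[str], Optional[str], str]:
    split = name.split(".")
    if len(split) > 3:  # now rejected instead of silently truncated
        raise ValueError(f"Invalid dataset name {name}")
    return (
        split[-3] if len(split) > 2 else None,  # namespace
        split[-2] if len(split) > 1 else None,  # project
        split[-1],                              # dataset name
    )

assert parse("cats") == (None, None, "cats")
assert parse("chatbot.cats") == (None, "chatbot", "cats")
assert parse("dev.chatbot.cats") == ("dev", "chatbot", "cats")
```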
@@ -659,13 +664,39 @@ class DatasetRecord:
             return None
         return max(versions).version

-    @property
-    def prev_version(self) -> Optional[str]:
-        """Returns previous version of a dataset"""
-        if len(self.versions) == 1:
+    def latest_compatible_version(self, version_spec: str) -> Optional[str]:
+        """
+        Returns the latest version that matches the given version specifier.
+
+        Supports Python version specifiers like:
+        - ">=1.0.0,<2.0.0" (compatible release range)
+        - "~=1.4.2" (compatible release clause)
+        - "==1.2.*" (prefix matching)
+        - ">1.0.0" (exclusive ordered comparison)
+        - ">=1.0.0" (inclusive ordered comparison)
+        - "!=1.3.0" (version exclusion)
+
+        Args:
+            version_spec: Version specifier string following PEP 440
+
+        Returns:
+            Latest compatible version string, or None if no compatible version found
+        """
+        spec_set = SpecifierSet(version_spec)
+
+        # Convert dataset versions to packaging.Version objects
+        # and filter compatible ones
+        compatible_versions = []
+        for v in self.versions:
+            pkg_version = Version(v.version)
+            if spec_set.contains(pkg_version):
+                compatible_versions.append(v)
+
+        if not compatible_versions:
             return None

-        return sorted(self.versions)[-2].version
+        # Return the latest compatible version
+        return max(compatible_versions).version

     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
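`latest_compatible_version` replaces the removed `prev_version` property and delegates matching entirely to the `packaging` library. The core selection logic can be exercised on plain version strings (here using `key=Version` where the method relies on `DatasetVersion` ordering):

```py
from packaging.specifiers import SpecifierSet
from packaging.version import Version

versions = ["1.0.0", "1.4.2", "1.9.0", "2.0.0", "2.4.3"]

spec = SpecifierSet(">=1.0.0,<2.0.0")
compatible = [v for v in versions if Version(v) in spec]
assert max(compatible, key=Version) == "1.9.0"

# No match -> the method returns None upstream.
assert not list(SpecifierSet("~=3.0").filter(versions))
```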
datachain/lib/dc/datachain.py CHANGED
@@ -24,7 +24,7 @@ from pydantic import BaseModel
 from tqdm import tqdm

 from datachain import semver
-from datachain.dataset import DatasetRecord, parse_dataset_name
+from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
 from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
 from datachain.func import literal
@@ -557,6 +557,7 @@ class DataChain:
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+        catalog = self.session.catalog
         if version is not None:
             semver.validate(version)

@@ -570,17 +571,10 @@ class DataChain:
                 " patch"
             )

-        namespace_name, project_name, name = parse_dataset_name(name)
-
-        namespace_name = (
-            namespace_name
-            or self._settings.namespace
-            or self.session.catalog.metastore.default_namespace_name
-        )
-        project_name = (
-            project_name
-            or self._settings.project
-            or self.session.catalog.metastore.default_project_name
+        namespace_name, project_name, name = catalog.get_full_dataset_name(
+            name,
+            namespace_name=self._settings.namespace,
+            project_name=self._settings.project,
         )

         try:
datachain/lib/dc/datasets.py CHANGED
@@ -1,16 +1,12 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints

-from datachain.dataset import parse_dataset_name
 from datachain.error import (
     DatasetNotFoundError,
     DatasetVersionNotFoundError,
     ProjectNotFoundError,
 )
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import (
-    File,
-)
 from datachain.lib.projects import get as get_project
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -35,7 +31,6 @@ def read_dataset(
     version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
-    fallback_to_studio: bool = True,
     delta: Optional[bool] = False,
     delta_on: Optional[Union[str, Sequence[str]]] = (
         "file.path",
@@ -45,6 +40,7 @@ def read_dataset(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -56,11 +52,12 @@ def read_dataset(
            set; otherwise, default values will be applied.
        namespace : optional name of namespace in which dataset to read is created
        project : optional name of project in which dataset to read is created
-        version : dataset version
+        version : dataset version. Supports:
+            - Exact version strings: "1.2.3"
+            - Legacy integer versions: 1, 2, 3 (finds latest major version)
+            - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
-        fallback_to_studio : Try to pull dataset from Studio if not found locally.
-            Default is True.
        delta: If True, only process new or changed files instead of reprocessing
            everything. This saves time by skipping files that were already processed in
            previous versions. The optimization is working when a new version of the
@@ -80,6 +77,10 @@ def read_dataset(
            (error mode)
            - True: Reprocess records missing from the result dataset (missing mode)
            - None: No retry processing (default)
+        update: If True always checks for newer versions available on Studio, even if
+            some version of the dataset exists locally already. If False (default), it
+            will only fetch the dataset from Studio if it is not found locally.
+

    Example:
        ```py
@@ -93,11 +94,22 @@ def read_dataset(
        ```

        ```py
-        chain = dc.read_dataset("my_cats", fallback_to_studio=False)
+        chain = dc.read_dataset("my_cats", version="1.0.0")
        ```

        ```py
-        chain = dc.read_dataset("my_cats", version="1.0.0")
+        # Using version specifiers (PEP 440)
+        chain = dc.read_dataset("my_cats", version=">=1.0.0,<2.0.0")
+        ```
+
+        ```py
+        # Legacy integer version support (finds latest in major version)
+        chain = dc.read_dataset("my_cats", version=1)  # Latest 1.x.x version
+        ```
+
+        ```py
+        # Always check for newer versions matching a version specifier from Studio
+        chain = dc.read_dataset("my_cats", version=">=1.0.0", update=True)
        ```

        ```py
@@ -114,7 +126,6 @@ def read_dataset(
            version="1.0.0",
            session=session,
            settings=settings,
-            fallback_to_studio=True,
        )
        ```
    """
@@ -122,41 +133,49 @@ def read_dataset(

    from .datachain import DataChain

+    telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+
    session = Session.get(session)
    catalog = session.catalog

-    namespace_name, project_name, name = parse_dataset_name(name)
-    namespace_name = (
-        namespace_name or namespace or catalog.metastore.default_namespace_name
+    namespace_name, project_name, name = catalog.get_full_dataset_name(
+        name,
+        project_name=project,
+        namespace_name=namespace,
    )
-    project_name = project_name or project or catalog.metastore.default_project_name

    if version is not None:
+        dataset = session.catalog.get_dataset_with_remote_fallback(
+            name, namespace_name, project_name, update=update
+        )
+
+        # Convert legacy integer versions to version specifiers
+        # For backward compatibility we still allow users to put version as integer
+        # in which case we convert it to a version specifier that finds the latest
+        # version where major part is equal to that input version.
+        # For example if user sets version=2, we convert it to ">=2.0.0,<3.0.0"
+        # which will find something like 2.4.3 (assuming 2.4.3 is the biggest among
+        # all 2.* dataset versions)
+        if isinstance(version, int):
+            version_spec = f">={version}.0.0,<{version + 1}.0.0"
+        else:
+            version_spec = str(version)
+
+        from packaging.specifiers import InvalidSpecifier, SpecifierSet
+
        try:
-            # for backward compatibility we still allow users to put version as integer
-            # in which case we are trying to find latest version where major part is
-            # equal to that input version. For example if user sets version=2, we could
-            # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
-            # all 2.* dataset versions). If dataset doesn't have any versions where
-            # major part is equal to that input, exception is thrown.
-            major = int(version)
-            try:
-                ds_project = get_project(project_name, namespace_name, session=session)
-            except ProjectNotFoundError:
-                raise DatasetNotFoundError(
-                    f"Dataset {name} not found in namespace {namespace_name} and",
-                    f" project {project_name}",
-                ) from None
-
-            dataset = session.catalog.get_dataset(name, ds_project)
-            latest_major = dataset.latest_major_version(major)
-            if not latest_major:
+            # Try to parse as version specifier
+            SpecifierSet(version_spec)
+            # If it's a valid specifier set, find the latest compatible version
+            latest_compatible = dataset.latest_compatible_version(version_spec)
+            if not latest_compatible:
                raise DatasetVersionNotFoundError(
-                    f"Dataset {name} does not have version {version}"
+                    f"No dataset {name} version matching specifier {version_spec}"
                )
-            version = latest_major
-        except ValueError:
-            # version is in new semver string format, continuing as normal
+            version = latest_compatible
+        except InvalidSpecifier:
+            # If not a valid specifier, treat as exact version string
+            # This handles cases like "1.2.3" which are exact versions, not specifiers
            pass

    if settings:
170
189
  namespace_name=namespace_name,
171
190
  version=version, # type: ignore[arg-type]
172
191
  session=session,
173
- indexing_column_types=File._datachain_column_types,
174
- fallback_to_studio=fallback_to_studio,
175
192
  )
176
193
 
177
- telemetry.send_event_once("class", "datachain_init", name=name, version=version)
178
194
  signals_schema = SignalSchema({"sys": Sys})
179
195
  if query.feature_schema:
180
196
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
@@ -320,11 +336,11 @@ def delete_dataset(
320
336
  session = Session.get(session, in_memory=in_memory)
321
337
  catalog = session.catalog
322
338
 
323
- namespace_name, project_name, name = parse_dataset_name(name)
324
- namespace_name = (
325
- namespace_name or namespace or catalog.metastore.default_namespace_name
339
+ namespace_name, project_name, name = catalog.get_full_dataset_name(
340
+ name,
341
+ project_name=project,
342
+ namespace_name=namespace,
326
343
  )
327
- project_name = project_name or project or catalog.metastore.default_project_name
328
344
 
329
345
  if not catalog.metastore.is_local_dataset(namespace_name) and studio:
330
346
  return remove_studio_dataset(
datachain/lib/dc/listings.py CHANGED
@@ -127,12 +127,8 @@ def read_listing_dataset(
    if version is None:
        version = dataset.latest_version

-    query = DatasetQuery(
-        name=name,
-        session=session,
-        indexing_column_types=File._datachain_column_types,
-        fallback_to_studio=False,
-    )
+    query = DatasetQuery(name=name, session=session)
+
    if settings:
        cfg = {**settings}
        if "prefetch" not in cfg:
datachain/lib/dc/records.py CHANGED
@@ -97,4 +97,4 @@ def read_records(
    for chunk in batched(records, INSERT_BATCH_SIZE):
        warehouse.insert_rows(table, chunk)
    warehouse.insert_rows_done(table)
-    return read_dataset(name=dsr.name, session=session, settings=settings)
+    return read_dataset(name=dsr.full_name, session=session, settings=settings)
datachain/lib/projects.py CHANGED
@@ -54,7 +54,7 @@ def get(name: str, namespace: str, session: Optional[Session]) -> Project:
    ```py
    import datachain as dc
    from datachain.lib.projects import get as get_project
-    project = get_project("my-project", "local") 
+    project = get_project("my-project", "local")
    ```
    """
    return Session.get(session).catalog.metastore.get_project(name, namespace)
datachain/lib/signal_schema.py CHANGED
@@ -25,6 +25,7 @@ from pydantic import BaseModel, Field, create_model
 from sqlalchemy import ColumnElement
 from typing_extensions import Literal as LiteralEx

+from datachain.func import literal
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.sql_to_python import sql_to_python
@@ -659,6 +660,7 @@ class SignalSchema:

     def mutate(self, args_map: dict) -> "SignalSchema":
         new_values = self.values.copy()
+        primitives = (bool, str, int, float)

         for name, value in args_map.items():
             if isinstance(value, Column) and value.name in self.values:
@@ -679,6 +681,12 @@ class SignalSchema:
                 # adding new signal with function
                 new_values[name] = value.get_result_type(self)
                 continue
+            if isinstance(value, primitives):
+                # For primitives, store the type, not the value
+                val = literal(value)
+                val.type = python_to_sql(type(value))()
+                new_values[name] = sql_to_python(val)
+                continue
             if isinstance(value, ColumnElement):
                 # adding new signal
                 new_values[name] = sql_to_python(value)
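With the new primitives branch, `mutate()` accepts plain Python constants and records the inferred column type (via `python_to_sql`/`sql_to_python`) rather than the value itself. A sketch of the intended usage; the column names here are made up for illustration:

```py
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
# "source" becomes a str column and "weight" a float column for every row.
chain = chain.mutate(source="manual", weight=1.5)
chain.show()
```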
datachain/query/dataset.py CHANGED
@@ -1099,13 +1099,9 @@ class DatasetQuery:
         namespace_name: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
-        indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
-        fallback_to_studio: bool = True,
         update: bool = False,
     ) -> None:
-        from datachain.remote.studio import is_token_set
-
         self.session = Session.get(session, catalog=catalog, in_memory=in_memory)
         self.catalog = catalog or self.session.catalog
         self.steps: list[Step] = []
@@ -1137,18 +1133,16 @@ class DatasetQuery:
             # not setting query step yet as listing dataset might not exist at
             # this point
             self.list_ds_name = name
-        elif fallback_to_studio and is_token_set():
+        else:
             self._set_starting_step(
                 self.catalog.get_dataset_with_remote_fallback(
                     name,
                     namespace_name=namespace_name,
                     project_name=project_name,
                     version=version,
+                    pull_dataset=True,
                 )
             )
-        else:
-            project = self.catalog.metastore.get_project(project_name, namespace_name)
-            self._set_starting_step(self.catalog.get_dataset(name, project=project))

     def _set_starting_step(self, ds: "DatasetRecord") -> None:
         if not self.version:
datachain/remote/studio.py CHANGED
@@ -78,10 +78,11 @@ def _parse_dates(obj: dict, date_fields: list[str]):


 class Response(Generic[T]):
-    def __init__(self, data: T, ok: bool, message: str) -> None:
+    def __init__(self, data: T, ok: bool, message: str, status: int) -> None:
         self.data = data
         self.ok = ok
         self.message = message
+        self.status = status

     def __repr__(self):
         return (
@@ -186,7 +187,7 @@ class StudioClient:
             message = "Indexing in progress"
         else:
             message = content.get("message", "")
-        return Response(response_data, ok, message)
+        return Response(response_data, ok, message, response.status_code)

     @retry_with_backoff(retries=3, errors=(HTTPError, Timeout))
     def _send_request(
@@ -236,7 +237,7 @@ class StudioClient:
         else:
             message = ""

-        return Response(data, ok, message)
+        return Response(data, ok, message, response.status_code)

     @staticmethod
     def _unpacker_hook(code, data):
datachain-0.22.0.dist-info/METADATA → datachain-0.24.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.22.0
+Version: 0.24.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
datachain-0.22.0.dist-info/RECORD → datachain-0.24.0.dist-info/RECORD RENAMED
@@ -3,7 +3,7 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=--7UI8lZ2lVhk2mNCsHACGigQe96-jBUcbnDMebj-cE,24089
+datachain/dataset.py,sha256=wDrukmkDnYP0X8bAGY-7O1NDE3DWCFqrH8VVDpXM9Ok,25263
 datachain/delta.py,sha256=4RqLLc9dJLF8x9GG9IDgi86DwuPerZQ4HAUnNBeACw8,8446
 datachain/error.py,sha256=OWwWMkzZYJrkcoEDGhJHMf7SfKvxcsOLRF94mjPf29I,1609
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
@@ -21,13 +21,13 @@ datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=43Yp1xQnwrozl2_VhVWQDFHxYPunXP95v7n1PoGc6mw,63546
+datachain/catalog/catalog.py,sha256=z4GbRMHeW0YA20Sjh7QuPy1Rj4RkX547WN9Pp5wAD6o,65277
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
 datachain/cli/__init__.py,sha256=WvBqnwjG8Wp9xGCn-4eqfoZ3n7Sj1HJemCi4MayJh_c,8221
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
-datachain/cli/commands/datasets.py,sha256=Bva9gTi1HMvvCQPFUPxLYrHQduDlJDWV8EN6IcJcC3Y,6949
+datachain/cli/commands/datasets.py,sha256=LfOe22O9OCEDI8K2cy05Gp4_Q-GFHOHRv4bXQ-USM4s,6472
 datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
 datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
 datachain/cli/commands/ls.py,sha256=CBmk838Q-EQp04lE2Qdnpsc1GXAkC4-I-b-a_828n1E,5272
@@ -49,10 +49,10 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
-datachain/data_storage/metastore.py,sha256=YhkHEHvE--jKoOpCS5LkcLDMekfCX76VwubbXPoAiic,52317
+datachain/data_storage/metastore.py,sha256=9mWYOKK3AoHeKPGFm-WBfPrmnYHhwYeXx5MOueKTe7I,52657
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=bGb4_kEFvnGf3ZWekiv8z3VMZBzQyO0bSaNB5RrpUUs,29991
+datachain/data_storage/sqlite.py,sha256=tT_soVi6l_pFSKaDktA1t4qW_vmPvXnvYSf4TZTKZYk,30067
 datachain/data_storage/warehouse.py,sha256=_7btARw-kd-Nx19S0qW6JqdF3VYyypQXFzsXq68SWKI,32327
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -82,10 +82,10 @@ datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/namespaces.py,sha256=it52UbbwB8dzhesO2pMs_nThXiPQ1Ph9sD9I3GQkg5s,2099
-datachain/lib/projects.py,sha256=C-HTzTLUbIB735_iBSV6MjWnntV6gaKCEIkMSR1YEQw,2596
+datachain/lib/projects.py,sha256=8lN0qV8czX1LGtWURCUvRlSJk-RpO9w9Rra_pOZus6g,2595
 datachain/lib/pytorch.py,sha256=oBBd6cxYrcwaFz7IQajKqhGqDdNnwUZWs0wJPRizrjk,7712
 datachain/lib/settings.py,sha256=9wi0FoHxRxNiyn99pR28IYsMkoo47jQxeXuObQr2Ar0,2929
-datachain/lib/signal_schema.py,sha256=Zhg8qThFDf9eoNWFH6KGeYB-sIGys7A_ybq2CUBG7Dg,36127
+datachain/lib/signal_schema.py,sha256=dVEqqrQQ_BS3yzU_49-Gari7IjVyMl1UT8h1WIsZabs,36489
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=3uITkhO8IZnX49aePheObzd5ORYi2DIDYZVMQlBAJ-s,16687
@@ -103,14 +103,14 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=B6z8e33ZAUKbJ-cqQko-VJEtmia2bfUnuqH7BQQVt_A,85998
-datachain/lib/dc/datasets.py,sha256=xiVNe7PosuIsyACFhly9qNxGmRQy1J2TQw3AD6uj9UM,12747
+datachain/lib/dc/datachain.py,sha256=dFI7JX5-41HLgA-TUR99dtR1lvk2vokaMC3mbIW1XT4,85814
+datachain/lib/dc/datasets.py,sha256=U4xqAfs6FdW8HIJjeayQaIg1dunaIsVXYGqfq_sDSv0,13274
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
-datachain/lib/dc/listings.py,sha256=eVBUP25W81dv46DLqkv8K0X7N3nxhoZm77gFrByeT_E,4660
+datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
-datachain/lib/dc/records.py,sha256=AMtfWc7K6mtbW2OiaeIm3SjHTxDGnSgCEQW5u984Qh0,3111
+datachain/lib/dc/records.py,sha256=FpPbApWopUri1gIaSMsfXN4fevja4mjmfb6Q5eiaGxI,3116
 datachain/lib/dc/storage.py,sha256=8xiV3c6k-sG14RGwNJCp0AbV6L0mNDsTVZ-Est-ccnw,7672
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
@@ -125,7 +125,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
 datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=SjFUh77rBTpgBZG4cfMJiJ2DhiCubGVk2cG1RYX4oyA,61571
+datachain/query/dataset.py,sha256=C60VM0pScsrWcMqLNdX-tU0HE1SnEE9lRN3TU8CfTu4,61223
 datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -135,7 +135,7 @@ datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,68
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=aSAh7CCHrTp7U-642jHFkwY0Fer4YRAHVVpWomab3zY,15110
+datachain/remote/studio.py,sha256=oJp2KD9eO8zQDnPfNpAALZYsOlBfqVKKRTeCkEpcsYk,15196
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.22.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.22.0.dist-info/METADATA,sha256=xfdXuYjS-y5_IokpYEC7ZlmB6Wx1ouF7bh6K-TAacJI,13281
-datachain-0.22.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.22.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.22.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.22.0.dist-info/RECORD,,
+datachain-0.24.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.24.0.dist-info/METADATA,sha256=QWSVON3r5d5d18gRMs9G5DNV4z-kBBY47dMYUEFR0b0,13281
+datachain-0.24.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.24.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.24.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.24.0.dist-info/RECORD,,