datachain 0.8.11__py3-none-any.whl → 0.8.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -38,7 +38,6 @@ from datachain.dataset import (
38
38
  DatasetDependency,
39
39
  DatasetListRecord,
40
40
  DatasetRecord,
41
- DatasetStats,
42
41
  DatasetStatus,
43
42
  StorageURI,
44
43
  create_dataset_uri,
@@ -1235,17 +1234,6 @@ class Catalog:
1235
1234
  dataset = self.get_dataset(name)
1236
1235
  return self.warehouse.dataset_table_export_file_names(dataset, version)
1237
1236
 
1238
- def dataset_stats(self, name: str, version: Optional[int]) -> DatasetStats:
1239
- """
1240
- Returns tuple with dataset stats: total number of rows and total dataset size.
1241
- """
1242
- dataset = self.get_dataset(name)
1243
- dataset_version = dataset.get_version(version or dataset.latest_version)
1244
- return DatasetStats(
1245
- num_objects=dataset_version.num_objects,
1246
- size=dataset_version.size,
1247
- )
1248
-
1249
1237
  def remove_dataset(
1250
1238
  self,
1251
1239
  name: str,
@@ -1391,19 +1379,12 @@ class Catalog:
1391
1379
  except DatasetNotFoundError:
1392
1380
  pass
1393
1381
 
1394
- stats_response = studio_client.dataset_stats(
1395
- remote_ds_name, remote_ds_version.version
1396
- )
1397
- if not stats_response.ok:
1398
- raise_remote_error(stats_response.message)
1399
- ds_stats = stats_response.data
1400
-
1401
1382
  dataset_save_progress_bar = tqdm(
1402
1383
  desc=f"Saving dataset {remote_ds_uri} locally: ",
1403
1384
  unit=" rows",
1404
1385
  unit_scale=True,
1405
1386
  unit_divisor=1000,
1406
- total=ds_stats.num_objects, # type: ignore [union-attr]
1387
+ total=remote_ds_version.num_objects, # type: ignore [union-attr]
1407
1388
  leave=False,
1408
1389
  )
1409
1390
 
datachain/cli/__init__.py CHANGED
@@ -11,7 +11,6 @@ from datachain.telemetry import telemetry
11
11
  from .commands import (
12
12
  clear_cache,
13
13
  completion,
14
- dataset_stats,
15
14
  du,
16
15
  edit_dataset,
17
16
  garbage_collect,
@@ -182,13 +181,6 @@ def handle_dataset_command(args, catalog):
182
181
  all=args.all,
183
182
  team=args.team,
184
183
  ),
185
- "stats": lambda: dataset_stats(
186
- catalog,
187
- args.name,
188
- args.version,
189
- show_bytes=args.bytes,
190
- si=args.si,
191
- ),
192
184
  }
193
185
 
194
186
  handler = dataset_commands.get(args.datasets_cmd)
@@ -1,5 +1,4 @@
1
1
  from .datasets import (
2
- dataset_stats,
3
2
  edit_dataset,
4
3
  list_datasets,
5
4
  list_datasets_local,
@@ -15,7 +14,6 @@ from .show import show
15
14
  __all__ = [
16
15
  "clear_cache",
17
16
  "completion",
18
- "dataset_stats",
19
17
  "du",
20
18
  "edit_dataset",
21
19
  "garbage_collect",
@@ -3,8 +3,6 @@ from typing import TYPE_CHECKING, Optional
3
3
 
4
4
  from tabulate import tabulate
5
5
 
6
- from datachain import utils
7
-
8
6
  if TYPE_CHECKING:
9
7
  from datachain.catalog import Catalog
10
8
 
@@ -109,20 +107,3 @@ def edit_dataset(
109
107
 
110
108
  if (all or studio) and token:
111
109
  edit_studio_dataset(team, name, new_name, description, labels)
112
-
113
-
114
- def dataset_stats(
115
- catalog: "Catalog",
116
- name: str,
117
- version: int,
118
- show_bytes=False,
119
- si=False,
120
- ):
121
- stats = catalog.dataset_stats(name, version)
122
-
123
- if stats:
124
- print(f"Number of objects: {stats.num_objects}")
125
- if show_bytes:
126
- print(f"Total objects size: {stats.size}")
127
- else:
128
- print(f"Total objects size: {utils.sizeof_fmt(stats.size, si=si): >7}")
@@ -307,31 +307,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
307
307
  help="The team to delete a dataset. By default, it will use team from config",
308
308
  )
309
309
 
310
- dataset_stats_parser = datasets_subparser.add_parser(
311
- "stats", parents=[parent_parser], description="Show basic dataset statistics."
312
- )
313
- dataset_stats_parser.add_argument("name", type=str, help="Dataset name")
314
- dataset_stats_parser.add_argument(
315
- "--version",
316
- action="store",
317
- default=None,
318
- type=int,
319
- help="Dataset version",
320
- )
321
- dataset_stats_parser.add_argument(
322
- "-b",
323
- "--bytes",
324
- default=False,
325
- action="store_true",
326
- help="Display size in bytes instead of human-readable size",
327
- )
328
- dataset_stats_parser.add_argument(
329
- "--si",
330
- default=False,
331
- action="store_true",
332
- help="Display size using powers of 1000 not 1024",
333
- )
334
-
335
310
  parse_ls = subp.add_parser(
336
311
  "ls", parents=[parent_parser], description="List storage contents."
337
312
  )
datachain/dataset.py CHANGED
@@ -150,12 +150,6 @@ class DatasetDependency:
150
150
  return hash(f"{self.type}_{self.name}_{self.version}")
151
151
 
152
152
 
153
- @dataclass
154
- class DatasetStats:
155
- num_objects: Optional[int] # None if table is missing
156
- size: Optional[int] # in bytes None if table is missing or empty
157
-
158
-
159
153
  class DatasetStatus:
160
154
  CREATED = 1
161
155
  PENDING = 2
@@ -9,7 +9,7 @@ from datachain.sql.functions import conditional
9
9
 
10
10
  from .func import ColT, Func
11
11
 
12
- CaseT = Union[int, float, complex, bool, str, Func]
12
+ CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
13
13
 
14
14
 
15
15
  def greatest(*args: Union[ColT, float]) -> Func:
@@ -94,11 +94,12 @@ def case(
94
94
  """
95
95
  Returns the case function that produces case expression which has a list of
96
96
  conditions and corresponding results. Results can be python primitives like string,
97
- numbers or booleans but can also be other nested function (including case function).
97
+ numbers or booleans but can also be other nested functions (including case function)
98
+ or columns.
98
99
  Result type is inferred from condition results.
99
100
 
100
101
  Args:
101
- args (tuple((ColumnElement, Func), (str | int | float | complex | bool, Func))):
102
+ args tuple((ColumnElement | Func),(str | int | float | complex | bool, Func, ColumnElement)):
102
103
  Tuple of condition and values pair.
103
104
  else_ (str | int | float | complex | bool, Func): optional else value in case
104
105
  expression. If omitted, and no case conditions are satisfied, the result
@@ -113,13 +114,16 @@ def case(
113
114
  res=func.case((C("num") > 0, "P"), (C("num") < 0, "N"), else_="Z"),
114
115
  )
115
116
  ```
116
- """
117
+ """ # noqa: E501
117
118
  supported_types = [int, float, complex, str, bool]
118
119
 
119
120
  def _get_type(val):
120
121
  if isinstance(val, Func):
121
122
  # nested functions
122
123
  return val.result_type
124
+ if isinstance(val, Column):
125
+ # at this point we cannot know what is the type of a column
126
+ return None
123
127
  return type(val)
124
128
 
125
129
  if not args:
@@ -129,13 +133,16 @@ def case(
129
133
 
130
134
  for arg in args:
131
135
  arg_type = _get_type(arg[1])
136
+ if arg_type is None:
137
+ # we couldn't figure out the type of case value
138
+ continue
132
139
  if type_ and arg_type != type_:
133
140
  raise DataChainParamsError(
134
141
  f"Statement values must be of the same type, got {type_} and {arg_type}"
135
142
  )
136
143
  type_ = arg_type
137
144
 
138
- if type_ not in supported_types:
145
+ if type_ is not None and type_ not in supported_types:
139
146
  raise DataChainParamsError(
140
147
  f"Only python literals ({supported_types}) are supported for values"
141
148
  )
@@ -151,15 +158,15 @@ def ifelse(
151
158
  """
152
159
  Returns the ifelse function that produces if expression which has a condition
153
160
  and values for true and false outcome. Results can be one of python primitives
154
- like string, numbers or booleans, but can also be nested functions.
161
+ like string, numbers or booleans, but can also be nested functions or columns.
155
162
  Result type is inferred from the values.
156
163
 
157
164
  Args:
158
165
  condition (ColumnElement, Func): Condition which is evaluated.
159
- if_val (str | int | float | complex | bool, Func): Value for true
166
+ if_val (str | int | float | complex | bool, Func, ColumnElement): Value for true
160
167
  condition outcome.
161
- else_val (str | int | float | complex | bool, Func): Value for false condition
162
- outcome.
168
+ else_val (str | int | float | complex | bool, Func, ColumnElement): Value for
169
+ false condition outcome.
163
170
 
164
171
  Returns:
165
172
  Func: A Func object that represents the ifelse function.
datachain/func/func.py CHANGED
@@ -424,10 +424,9 @@ class Func(Function):
424
424
 
425
425
  def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
426
426
  if isinstance(col, tuple):
427
- raise DataChainParamsError(
428
- "Cannot get type from tuple, please provide type hint to the function"
429
- )
430
-
427
+ # we can only get tuple from case statement where the first tuple item
428
+ # is condition, and second one is value which type is important
429
+ col = col[1]
431
430
  if isinstance(col, Func):
432
431
  return col.get_result_type(signals_schema)
433
432
 
@@ -435,7 +434,7 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
435
434
  return sql_to_python(col)
436
435
 
437
436
  return signals_schema.get_column_type(
438
- col.name if isinstance(col, ColumnElement) else col
437
+ col.name if isinstance(col, ColumnElement) else col # type: ignore[arg-type]
439
438
  )
440
439
 
441
440
 
@@ -16,14 +16,12 @@ from urllib.parse import urlparse, urlunparse
16
16
  import websockets
17
17
 
18
18
  from datachain.config import Config
19
- from datachain.dataset import DatasetStats
20
19
  from datachain.error import DataChainError
21
20
  from datachain.utils import STUDIO_URL, retry_with_backoff
22
21
 
23
22
  T = TypeVar("T")
24
23
  LsData = Optional[list[dict[str, Any]]]
25
24
  DatasetInfoData = Optional[dict[str, Any]]
26
- DatasetStatsData = Optional[DatasetStats]
27
25
  DatasetRowsData = Optional[Iterable[dict[str, Any]]]
28
26
  DatasetJobVersionsData = Optional[dict[str, Any]]
29
27
  DatasetExportStatus = Optional[dict[str, Any]]
@@ -309,7 +307,7 @@ class StudioClient:
309
307
  "datachain/datasets",
310
308
  {
311
309
  "dataset_name": name,
312
- "version": version,
310
+ "dataset_version": version,
313
311
  "force": force,
314
312
  },
315
313
  method="DELETE",
@@ -347,16 +345,6 @@ class StudioClient:
347
345
  method="GET",
348
346
  )
349
347
 
350
- def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
351
- response = self._send_request(
352
- "datachain/datasets/stats",
353
- {"dataset_name": name, "dataset_version": version},
354
- method="GET",
355
- )
356
- if response.ok:
357
- response.data = DatasetStats(**response.data)
358
- return response
359
-
360
348
  def export_dataset_table(
361
349
  self, name: str, version: int
362
350
  ) -> Response[DatasetExportSignedUrls]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: datachain
3
- Version: 0.8.11
3
+ Version: 0.8.12
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -3,7 +3,7 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
4
4
  datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
5
5
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
6
- datachain/dataset.py,sha256=8bUiEHsO_-Ziwxt7NWTSFM2s3R-EsgWaxTifWSETDv4,19155
6
+ datachain/dataset.py,sha256=uqP6gtVFcVMVUFyB9Twr6Uk2onx-aBurbli8_VZu4-s,18993
7
7
  datachain/error.py,sha256=P1VI-etraA08ZrXHUEg1-xnOa2MkONd7vV0qA5uxBig,1314
8
8
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
9
9
  datachain/listing.py,sha256=HNB-xeKA6aUA-HTWr--H22S6jVOxP2OVQ-3d07ISqAk,7109
@@ -16,20 +16,20 @@ datachain/studio.py,sha256=iMVDm9wfc86_G02N2p7qF4sdmKDGUkGz7kTOKc9m3Ao,9408
16
16
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
17
17
  datachain/utils.py,sha256=LBeg-9n48saBTHSPk7u_j-kjJnPUAq5Oyps_peSaqlM,14128
18
18
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
19
- datachain/catalog/catalog.py,sha256=gjGNnUMRXSu9Hhu63yad8pvxA_JB4ctDTnL0Wk0dZP8,59312
19
+ datachain/catalog/catalog.py,sha256=Kg5JBfuf-e7QoiHx1wLKRq4h3KmWEMpCHpHLd-WBX9E,58611
20
20
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
21
21
  datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
22
- datachain/cli/__init__.py,sha256=vvxW6CekdfO2tF6VT3U7K4AOOfVZlfHVnArNJlOU7P4,8418
22
+ datachain/cli/__init__.py,sha256=B6xw0qTcBgrICPqeWOhVXPaWJcxdKKg0Os6j2_IGAIc,8219
23
23
  datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
24
- datachain/cli/commands/__init__.py,sha256=uc77ggTRWrq-w1AVsH3Muy6v1ATkNsXUBPIRaOFgNus,533
25
- datachain/cli/commands/datasets.py,sha256=q1FkvFfeBCkuIuaA8pick0y51ZQuQK89ULUFse5xsu0,3583
24
+ datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
25
+ datachain/cli/commands/datasets.py,sha256=k_CwJ_wYX-Jcc_Z8t9a8vX5jFHVt7qDQ0dehaA94iKs,3140
26
26
  datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
27
27
  datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
28
28
  datachain/cli/commands/ls.py,sha256=Wb8hXyBwyhb62Zk6ZhNFPFrj2lJhdbRcnBQQkgL_qyw,5174
29
29
  datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
30
30
  datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
31
31
  datachain/cli/commands/show.py,sha256=RVb_7Kjd1kzqTxRKYFvmD04LaJHOtrCc4FYMyc-ZEYw,1149
32
- datachain/cli/parser/__init__.py,sha256=OYD2hOKxjEopTBAH8PTVmM5SROroFNPfLbBHka05KPk,15155
32
+ datachain/cli/parser/__init__.py,sha256=mfOf3tbN4xGr4WQND1B5qMQ4LXoqEU9OhYac7wI-WBc,14393
33
33
  datachain/cli/parser/job.py,sha256=Zpi_bEsMp71YCr8xay0I93Taz8zS0_jHbxtvvTzXj6c,3197
34
34
  datachain/cli/parser/studio.py,sha256=CwmfdnsDNvDTOEbhLmjun18s4yo8zCgrtGTpF67qf8Q,2968
35
35
  datachain/cli/parser/utils.py,sha256=7ZtzGXfBAjVthnc-agz7R9rplnxqFan0LT6x2tq-6Fk,2007
@@ -54,8 +54,8 @@ datachain/func/__init__.py,sha256=qaSjakSaTsRtnU7Hcb4lJk71tbwk7M0oWmjRqXExCLA,10
54
54
  datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
55
55
  datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
56
56
  datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
57
- datachain/func/conditional.py,sha256=7f-fQfpCbirWSgN-pMn2FmDUp7l7LTqEa5L17bzBbIk,6037
58
- datachain/func/func.py,sha256=IfNOj5s4oyiViyAyaUJPgWAyT6TCdIIalur_U442Icg,16115
57
+ datachain/func/conditional.py,sha256=g46zwW-i87uA45zWJnPHtHaqr6qOXSg6xLb4p9W3Gtk,6400
58
+ datachain/func/func.py,sha256=PnwTRAiEJUus3e4NYdQ-hldqLzKS9hY0FjiyBMZhsSo,16183
59
59
  datachain/func/numeric.py,sha256=gMe1Ks0dqQKHkjcpvj7I5S-neECzQ_gltPQLNoaWOyo,5632
60
60
  datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
61
61
  datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
@@ -111,7 +111,7 @@ datachain/query/session.py,sha256=fQAtl5zRESRDfRS2d5J9KgrWauunCtrd96vP4Ns1KlE,59
111
111
  datachain/query/udf.py,sha256=GY8E9pnzPE7ZKl_jvetZpn9R2rlUtMlhoYj4UmrzFzw,594
112
112
  datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
113
113
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
- datachain/remote/studio.py,sha256=8Ml5OMZsE2pA7fqdmVbLSyXeXcQ61mHxUWII9roAYQU,13362
114
+ datachain/remote/studio.py,sha256=3v4ZqP06BwBMLXQ4mbcTS95oUodYgBv9A5XisL6ffWo,12915
115
115
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
116
116
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
117
117
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -133,9 +133,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
133
133
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
134
134
  datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
135
135
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
136
- datachain-0.8.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
137
- datachain-0.8.11.dist-info/METADATA,sha256=6zSquPCm1gTPDYu2kYk0juVlOjtbTteekcERpBoD9uw,10880
138
- datachain-0.8.11.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
139
- datachain-0.8.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
140
- datachain-0.8.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
141
- datachain-0.8.11.dist-info/RECORD,,
136
+ datachain-0.8.12.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
137
+ datachain-0.8.12.dist-info/METADATA,sha256=C1vaFTVw44GIVe32CcfLthfCi5nbbqTgS7HL61iSFGg,10880
138
+ datachain-0.8.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
139
+ datachain-0.8.12.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
140
+ datachain-0.8.12.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
141
+ datachain-0.8.12.dist-info/RECORD,,