datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release. This version of datachain might be problematic.

Files changed (105)
  1. datachain/asyn.py +11 -12
  2. datachain/cache.py +5 -5
  3. datachain/catalog/catalog.py +75 -83
  4. datachain/catalog/loader.py +3 -3
  5. datachain/checkpoint.py +1 -2
  6. datachain/cli/__init__.py +2 -4
  7. datachain/cli/commands/datasets.py +13 -13
  8. datachain/cli/commands/ls.py +4 -4
  9. datachain/cli/commands/query.py +3 -3
  10. datachain/cli/commands/show.py +2 -2
  11. datachain/cli/parser/job.py +1 -1
  12. datachain/cli/parser/utils.py +1 -2
  13. datachain/cli/utils.py +1 -2
  14. datachain/client/azure.py +2 -2
  15. datachain/client/fsspec.py +11 -21
  16. datachain/client/gcs.py +3 -3
  17. datachain/client/http.py +4 -4
  18. datachain/client/local.py +4 -4
  19. datachain/client/s3.py +3 -3
  20. datachain/config.py +4 -8
  21. datachain/data_storage/db_engine.py +5 -5
  22. datachain/data_storage/metastore.py +107 -107
  23. datachain/data_storage/schema.py +18 -24
  24. datachain/data_storage/sqlite.py +21 -28
  25. datachain/data_storage/warehouse.py +13 -13
  26. datachain/dataset.py +64 -70
  27. datachain/delta.py +21 -18
  28. datachain/diff/__init__.py +13 -13
  29. datachain/func/aggregate.py +9 -11
  30. datachain/func/array.py +12 -12
  31. datachain/func/base.py +7 -4
  32. datachain/func/conditional.py +9 -13
  33. datachain/func/func.py +45 -42
  34. datachain/func/numeric.py +5 -7
  35. datachain/func/string.py +2 -2
  36. datachain/hash_utils.py +54 -81
  37. datachain/job.py +8 -8
  38. datachain/lib/arrow.py +17 -14
  39. datachain/lib/audio.py +6 -6
  40. datachain/lib/clip.py +5 -4
  41. datachain/lib/convert/python_to_sql.py +4 -22
  42. datachain/lib/convert/values_to_tuples.py +4 -9
  43. datachain/lib/data_model.py +20 -19
  44. datachain/lib/dataset_info.py +6 -6
  45. datachain/lib/dc/csv.py +10 -10
  46. datachain/lib/dc/database.py +28 -29
  47. datachain/lib/dc/datachain.py +98 -97
  48. datachain/lib/dc/datasets.py +22 -22
  49. datachain/lib/dc/hf.py +4 -4
  50. datachain/lib/dc/json.py +9 -10
  51. datachain/lib/dc/listings.py +5 -8
  52. datachain/lib/dc/pandas.py +3 -6
  53. datachain/lib/dc/parquet.py +5 -5
  54. datachain/lib/dc/records.py +5 -5
  55. datachain/lib/dc/storage.py +12 -12
  56. datachain/lib/dc/storage_pattern.py +2 -2
  57. datachain/lib/dc/utils.py +11 -14
  58. datachain/lib/dc/values.py +3 -6
  59. datachain/lib/file.py +32 -28
  60. datachain/lib/hf.py +7 -5
  61. datachain/lib/image.py +13 -13
  62. datachain/lib/listing.py +5 -5
  63. datachain/lib/listing_info.py +1 -2
  64. datachain/lib/meta_formats.py +1 -2
  65. datachain/lib/model_store.py +3 -3
  66. datachain/lib/namespaces.py +4 -6
  67. datachain/lib/projects.py +5 -9
  68. datachain/lib/pytorch.py +10 -10
  69. datachain/lib/settings.py +23 -23
  70. datachain/lib/signal_schema.py +52 -44
  71. datachain/lib/text.py +8 -7
  72. datachain/lib/udf.py +25 -17
  73. datachain/lib/udf_signature.py +11 -11
  74. datachain/lib/video.py +3 -4
  75. datachain/lib/webdataset.py +30 -35
  76. datachain/lib/webdataset_laion.py +15 -16
  77. datachain/listing.py +4 -4
  78. datachain/model/bbox.py +3 -1
  79. datachain/namespace.py +4 -4
  80. datachain/node.py +6 -6
  81. datachain/nodes_thread_pool.py +0 -1
  82. datachain/plugins.py +1 -7
  83. datachain/project.py +4 -4
  84. datachain/query/batch.py +7 -8
  85. datachain/query/dataset.py +80 -87
  86. datachain/query/dispatch.py +7 -7
  87. datachain/query/metrics.py +3 -4
  88. datachain/query/params.py +2 -3
  89. datachain/query/schema.py +7 -6
  90. datachain/query/session.py +7 -7
  91. datachain/query/udf.py +8 -7
  92. datachain/query/utils.py +3 -5
  93. datachain/remote/studio.py +33 -39
  94. datachain/script_meta.py +12 -12
  95. datachain/sql/sqlite/base.py +6 -9
  96. datachain/studio.py +30 -30
  97. datachain/toolkit/split.py +1 -2
  98. datachain/utils.py +21 -21
  99. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
  100. datachain-0.35.0.dist-info/RECORD +173 -0
  101. datachain-0.34.6.dist-info/RECORD +0 -173
  102. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
  103. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
  104. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
  105. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
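
Nearly every hunk below applies the same typing cleanup: `Optional[X]` and `Union[X, Y]` become PEP 604 unions (`X | None`, `X | Y`), and `Callable` moves from `typing` to `collections.abc`. Since these annotations are evaluated at runtime (no `from __future__ import annotations` appears in the touched files shown here), this suggests the package now assumes Python 3.10 or newer. A minimal before/after sketch of the pattern, using a hypothetical function:

    from collections.abc import Callable

    # 0.34.6 style:
    #   from typing import Callable, Optional, Union
    #   def fetch(url: str, cb: Optional[Callable[[int], None]] = None) -> Union[str, bytes]: ...

    # 0.35.0 style: same runtime behavior, PEP 604 unions, Callable from collections.abc
    def fetch(url: str, cb: Callable[[int], None] | None = None) -> str | bytes:
        ...  # illustrative stub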
datachain/data_storage/sqlite.py CHANGED
@@ -1,18 +1,11 @@
  import logging
  import os
  import sqlite3
- from collections.abc import Iterable, Sequence
+ from collections.abc import Callable, Iterable, Sequence
  from contextlib import contextmanager
  from functools import cached_property, wraps
  from time import sleep
- from typing import (
-     TYPE_CHECKING,
-     Any,
-     Callable,
-     ClassVar,
-     Optional,
-     Union,
- )
+ from typing import TYPE_CHECKING, Any, ClassVar, Union

  import sqlalchemy
  from sqlalchemy import (
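
`collections.abc.Callable` has been subscriptable since Python 3.9 (PEP 585), and `typing.Callable` is now just a deprecated alias for it, so moving the import is behavior-preserving. An illustrative check:

    from collections.abc import Callable

    Handler = Callable[[str], int]  # same shape as the old typing.Callable[[str], int]

    def apply(handler: Handler, value: str) -> int:
        return handler(value)

    assert apply(len, "abc") == 3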
@@ -105,8 +98,8 @@ def retry_sqlite_locks(func):


  def get_db_file_in_memory(
-     db_file: Optional[str] = None, in_memory: bool = False
- ) -> Optional[str]:
+     db_file: str | None = None, in_memory: bool = False
+ ) -> str | None:
      """Get in-memory db_file and check that conflicting arguments are not provided."""
      if in_memory:
          if db_file and db_file != ":memory:":
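
The hunk above shows only the head of `get_db_file_in_memory`. A plausible completion of the visible logic, assuming the conflict raises `ValueError` and the in-memory case normalizes to `":memory:"` (neither the message nor the return value is shown in this diff):

    def get_db_file_in_memory(db_file: str | None = None, in_memory: bool = False) -> str | None:
        """Get in-memory db_file and check that conflicting arguments are not provided."""
        if in_memory:
            if db_file and db_file != ":memory:":
                raise ValueError("db_file and in_memory=True cannot be combined")  # assumed message
            return ":memory:"
        return db_file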
@@ -119,7 +112,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
      dialect = sqlite_dialect

      db: sqlite3.Connection
-     db_file: Optional[str]
+     db_file: str | None
      is_closed: bool

      def __init__(
@@ -127,8 +120,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
          engine: "Engine",
          metadata: "MetaData",
          db: sqlite3.Connection,
-         db_file: Optional[str] = None,
-         max_variable_number: Optional[int] = 999,
+         db_file: str | None = None,
+         max_variable_number: int | None = 999,
      ):
          self.engine = engine
          self.metadata = metadata
@@ -138,12 +131,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
          self.max_variable_number = max_variable_number

      @classmethod
-     def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
+     def from_db_file(cls, db_file: str | None = None) -> "SQLiteDatabaseEngine":
          return cls(*cls._connect(db_file=db_file))

      @staticmethod
      def _connect(
-         db_file: Optional[str] = None,
+         db_file: str | None = None,
      ) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
          try:
              if db_file == ":memory:":
@@ -232,7 +225,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
      def execute(
          self,
          query,
-         cursor: Optional[sqlite3.Cursor] = None,
+         cursor: sqlite3.Cursor | None = None,
          conn=None,
      ) -> sqlite3.Cursor:
          if self.is_closed:
@@ -251,7 +244,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):

      @retry_sqlite_locks
      def executemany(
-         self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
+         self, query, params, cursor: sqlite3.Cursor | None = None, conn=None
      ) -> sqlite3.Cursor:
          if cursor:
              return cursor.executemany(self.compile(query).string, params)
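
The `retry_sqlite_locks` decorator wrapping `execute`/`executemany` is referenced but not defined in these hunks; given the `from time import sleep` import at the top of the module, it presumably retries on SQLite lock contention. A minimal sketch of such a decorator (the retry count and backoff are assumptions, not datachain's actual values):

    import sqlite3
    from functools import wraps
    from time import sleep

    def retry_sqlite_locks(func):
        """Retry calls that fail with 'database is locked' (illustrative sketch)."""
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts = 10  # assumed retry budget
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except sqlite3.OperationalError as exc:
                    if "database is locked" not in str(exc) or attempt == attempts - 1:
                        raise
                    sleep(0.05 * (attempt + 1))  # assumed linear backoff
        return wrapper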
@@ -351,13 +344,13 @@ class SQLiteMetastore(AbstractDBMetastore):

      META_TABLE = "meta"

-     db: "SQLiteDatabaseEngine"
+     db: SQLiteDatabaseEngine

      def __init__(
          self,
-         uri: Optional[StorageURI] = None,
-         db: Optional["SQLiteDatabaseEngine"] = None,
-         db_file: Optional[str] = None,
+         uri: StorageURI | None = None,
+         db: SQLiteDatabaseEngine | None = None,
+         db_file: str | None = None,
          in_memory: bool = False,
      ):
          uri = uri or StorageURI("")
@@ -384,7 +377,7 @@ class SQLiteMetastore(AbstractDBMetastore):

      def clone(
          self,
-         uri: Optional[StorageURI] = None,
+         uri: StorageURI | None = None,
          use_new_connection: bool = False,
      ) -> "SQLiteMetastore":
          uri = uri or StorageURI("")
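
Dropping the quotes in `db: SQLiteDatabaseEngine` is safe because the class is defined earlier in the same module; string annotations are only required for names not yet bound at evaluation time. For instance:

    class Engine:
        pass

    class Metastore:
        db: Engine            # fine: Engine is already defined above
        parent: "Metastore"   # must stay quoted: Metastore is not bound yet here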
@@ -582,15 +575,15 @@ class SQLiteWarehouse(AbstractWarehouse):
      This is currently used for the local cli.
      """

-     db: "SQLiteDatabaseEngine"
+     db: SQLiteDatabaseEngine

      # Cache for our defined column types to dialect specific TypeEngine relations
      _col_python_type: ClassVar[dict[type, "TypeEngine"]] = {}

      def __init__(
          self,
-         db: Optional["SQLiteDatabaseEngine"] = None,
-         db_file: Optional[str] = None,
+         db: SQLiteDatabaseEngine | None = None,
+         db_file: str | None = None,
          in_memory: bool = False,
      ):
          self.schema: DefaultSchema = DefaultSchema()
@@ -645,7 +638,7 @@ class SQLiteWarehouse(AbstractWarehouse):
              only=filter_tables,
          )

-     def is_ready(self, timeout: Optional[int] = None) -> bool:
+     def is_ready(self, timeout: int | None = None) -> bool:
          return True

      def create_dataset_rows_table(
@@ -791,7 +784,7 @@ class SQLiteWarehouse(AbstractWarehouse):
          self,
          table: Table,
          query: Select,
-         progress_cb: Optional[Callable[[int], None]] = None,
+         progress_cb: Callable[[int], None] | None = None,
      ) -> None:
          col_id = (
              query.selected_columns.sys__id
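
`progress_cb: Callable[[int], None] | None` is the usual optional progress hook: called with a running row count, returning nothing. A hedged usage sketch (the function and its batching are hypothetical, not datachain's internals):

    from collections.abc import Callable

    def copy_rows(rows: list[dict], progress_cb: Callable[[int], None] | None = None) -> None:
        done = 0
        for start in range(0, len(rows), 100):  # assumed batch size
            done += len(rows[start:start + 100])
            if progress_cb is not None:
                progress_cb(done)  # report cumulative rows copied

    copy_rows([{"id": i} for i in range(250)], progress_cb=lambda n: print(f"{n} rows"))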
datachain/data_storage/warehouse.py CHANGED
@@ -4,8 +4,8 @@ import posixpath
  import random
  import string
  from abc import ABC, abstractmethod
- from collections.abc import Generator, Iterable, Iterator, Sequence
- from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+ from collections.abc import Callable, Generator, Iterable, Iterator, Sequence
+ from typing import TYPE_CHECKING, Any, Union
  from urllib.parse import urlparse

  import attrs
@@ -174,12 +174,12 @@ class AbstractWarehouse(ABC, Serializable):
      #

      @abstractmethod
-     def is_ready(self, timeout: Optional[int] = None) -> bool: ...
+     def is_ready(self, timeout: int | None = None) -> bool: ...

      def dataset_rows(
          self,
          dataset: DatasetRecord,
-         version: Optional[str] = None,
+         version: str | None = None,
          column: str = "file",
      ):
          version = version or dataset.latest_version
@@ -424,7 +424,7 @@ class AbstractWarehouse(ABC, Serializable):

      def dataset_stats(
          self, dataset: DatasetRecord, version: str
-     ) -> tuple[Optional[int], Optional[int]]:
+     ) -> tuple[int | None, int | None]:
          """
          Returns tuple with dataset stats: total number of rows and total dataset size.
          """
@@ -549,7 +549,7 @@ class AbstractWarehouse(ABC, Serializable):
          dr = dataset_rows
          columns = [c.name for c in query.selected_columns]
          for row in self.db.execute(query):
-             d = dict(zip(columns, row))
+             d = dict(zip(columns, row, strict=False))
              yield Node(**{dr.without_object(k): v for k, v in d.items()})

      def get_dirs_by_parent_path(
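
`zip(..., strict=False)` keeps the old silent-truncation behavior while making the choice explicit; the `strict=` flag was added in Python 3.10 (PEP 618), and linters such as ruff's B905 rule flag `zip()` calls that omit it. For comparison:

    columns = ["id", "name"]
    row = (1, "dogs", "extra")

    print(dict(zip(columns, row, strict=False)))  # {'id': 1, 'name': 'dogs'}; extras dropped

    try:
        dict(zip(columns, row, strict=True))
    except ValueError as exc:
        print(exc)  # raised because the iterables differ in length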
@@ -786,7 +786,7 @@ class AbstractWarehouse(ABC, Serializable):
      def size(
          self,
          dataset_rows: "DataTable",
-         node: Union[Node, dict[str, Any]],
+         node: Node | dict[str, Any],
          count_files: bool = False,
      ) -> tuple[int, int]:
          """
@@ -828,10 +828,10 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          dataset_rows: "DataTable",
          parent_path: str,
-         fields: Optional[Sequence[str]] = None,
-         type: Optional[str] = None,
+         fields: Sequence[str] | None = None,
+         type: str | None = None,
          conds=None,
-         order_by: Optional[Union[str, list[str]]] = None,
+         order_by: str | list[str] | None = None,
          include_subobjects: bool = True,
      ) -> sa.Select:
          if not conds:
@@ -869,7 +869,7 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          dataset_rows: "DataTable",
          node: Node,
-         sort: Union[list[str], str, None] = None,
+         sort: list[str] | str | None = None,
          include_subobjects: bool = True,
      ) -> Iterator[NodeWithPath]:
          """
@@ -927,7 +927,7 @@ class AbstractWarehouse(ABC, Serializable):
      def create_udf_table(
          self,
          columns: Sequence["sa.Column"] = (),
-         name: Optional[str] = None,
+         name: str | None = None,
      ) -> sa.Table:
          """
          Create a temporary table for storing custom signals generated by a UDF.
@@ -948,7 +948,7 @@ class AbstractWarehouse(ABC, Serializable):
          self,
          table: sa.Table,
          query: sa.Select,
-         progress_cb: Optional[Callable[[int], None]] = None,
+         progress_cb: Callable[[int], None] | None = None,
      ) -> None:
          """
          Copy the results of a query into a table.
datachain/dataset.py CHANGED
@@ -3,13 +3,7 @@ import json
  from dataclasses import dataclass, fields
  from datetime import datetime
  from functools import cached_property
- from typing import (
-     Any,
-     NewType,
-     Optional,
-     TypeVar,
-     Union,
- )
+ from typing import Any, NewType, TypeVar
  from urllib.parse import urlparse

  from packaging.specifiers import SpecifierSet
@@ -43,7 +37,7 @@ DATASET_NAME_REPLACEMENT_CHAR = "_"
  StorageURI = NewType("StorageURI", str)


- def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
+ def parse_dataset_uri(uri: str) -> tuple[str, str | None]:
      """
      Parse dataser uri to extract name and version out of it (if version is defined)
      Example:
@@ -65,7 +59,7 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:


  def create_dataset_uri(
-     name: str, namespace: str, project: str, version: Optional[str] = None
+     name: str, namespace: str, project: str, version: str | None = None
  ) -> str:
      """
      Creates a dataset uri based on namespace, project, dataset name and optionally
@@ -81,7 +75,7 @@ def create_dataset_uri(
      return uri


- def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
+ def parse_dataset_name(name: str) -> tuple[str | None, str | None, str]:
      """Parses dataset name and returns namespace, project and name"""
      if not name:
          raise InvalidDatasetNameError("Name must be defined to parse it")
@@ -111,7 +105,7 @@ class DatasetDependency:
      name: str
      version: str
      created_at: datetime
-     dependencies: list[Optional["DatasetDependency"]]
+     dependencies: list["DatasetDependency | None"]

      @property
      def dataset_name(self) -> str:
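
Note how `list[Optional["DatasetDependency"]]` becomes `list["DatasetDependency | None"]`, with the union folded inside the string: a forward reference is stored unevaluated, so the `|` costs nothing at class-creation time and is only resolved if something calls `typing.get_type_hints()`. A small demonstration with a hypothetical self-referential dataclass:

    import typing
    from dataclasses import dataclass

    @dataclass
    class Dep:
        children: list["Dep | None"]  # string keeps the union lazy

    d = Dep(children=[None])
    print(typing.get_type_hints(Dep))  # resolves to roughly {'children': list[Dep | None]}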
@@ -131,12 +125,12 @@ class DatasetDependency:
          namespace_name: str,
          project_name: str,
          id: int,
-         dataset_id: Optional[int],
-         dataset_version_id: Optional[int],
-         dataset_name: Optional[str],
-         dataset_version: Optional[str],
-         dataset_version_created_at: Optional[datetime],
-     ) -> Optional["DatasetDependency"]:
+         dataset_id: int | None,
+         dataset_version_id: int | None,
+         dataset_name: str | None,
+         dataset_version: str | None,
+         dataset_version_created_at: datetime | None,
+     ) -> "DatasetDependency | None":
          from datachain.lib.listing import is_listing_dataset

          if not dataset_id:
@@ -198,17 +192,17 @@ class DatasetVersion:
      status: int
      feature_schema: dict
      created_at: datetime
-     finished_at: Optional[datetime]
+     finished_at: datetime | None
      error_message: str
      error_stack: str
      script_output: str
-     schema: dict[str, Union[SQLType, type[SQLType]]]
-     num_objects: Optional[int]
-     size: Optional[int]
-     _preview_data: Optional[Union[str, list[dict]]]
+     schema: dict[str, SQLType | type[SQLType]]
+     num_objects: int | None
+     size: int | None
+     _preview_data: str | list[dict] | None
      sources: str = ""
      query_script: str = ""
-     job_id: Optional[str] = None
+     job_id: str | None = None

      @classmethod
      def parse(  # noqa: PLR0913
@@ -218,19 +212,19 @@ class DatasetVersion:
          dataset_id: int,
          version: str,
          status: int,
-         feature_schema: Optional[str],
+         feature_schema: str | None,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
          script_output: str,
-         num_objects: Optional[int],
-         size: Optional[int],
-         preview: Optional[Union[str, list[dict]]],
-         schema: dict[str, Union[SQLType, type[SQLType]]],
+         num_objects: int | None,
+         size: int | None,
+         preview: str | list[dict] | None,
+         schema: dict[str, SQLType | type[SQLType]],
          sources: str = "",
          query_script: str = "",
-         job_id: Optional[str] = None,
+         job_id: str | None = None,
      ):
          return cls(
              id,
@@ -292,7 +286,7 @@ class DatasetVersion:
          }

      @cached_property
-     def preview(self) -> Optional[list[dict]]:
+     def preview(self) -> list[dict] | None:
          if isinstance(self._preview_data, str):
              return json.loads(self._preview_data)
          return self._preview_data if self._preview_data else None
@@ -313,13 +307,13 @@ class DatasetListVersion:
      version: str
      status: int
      created_at: datetime
-     finished_at: Optional[datetime]
+     finished_at: datetime | None
      error_message: str
      error_stack: str
-     num_objects: Optional[int]
-     size: Optional[int]
+     num_objects: int | None
+     size: int | None
      query_script: str = ""
-     job_id: Optional[str] = None
+     job_id: str | None = None

      @classmethod
      def parse(
@@ -330,13 +324,13 @@ class DatasetListVersion:
          version: str,
          status: int,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
-         num_objects: Optional[int],
-         size: Optional[int],
+         num_objects: int | None,
+         size: int | None,
          query_script: str = "",
-         job_id: Optional[str] = None,
+         job_id: str | None = None,
          **kwargs,
      ):
          return cls(
@@ -368,14 +362,14 @@ class DatasetRecord:
      id: int
      name: str
      project: Project
-     description: Optional[str]
+     description: str | None
      attrs: list[str]
-     schema: dict[str, Union[SQLType, type[SQLType]]]
+     schema: dict[str, SQLType | type[SQLType]]
      feature_schema: dict
      versions: list[DatasetVersion]
      status: int = DatasetStatus.CREATED
-     created_at: Optional[datetime] = None
-     finished_at: Optional[datetime] = None
+     created_at: datetime | None = None
+     finished_at: datetime | None = None
      error_message: str = ""
      error_stack: str = ""
      script_output: str = ""
@@ -388,7 +382,7 @@ class DatasetRecord:
      @staticmethod
      def parse_schema(
          ct: dict[str, Any],
-     ) -> dict[str, Union[SQLType, type[SQLType]]]:
+     ) -> dict[str, SQLType | type[SQLType]]:
          return {
              c_name: NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type)  # type: ignore [attr-defined]
              for c_name, c_type in ct.items()
@@ -409,23 +403,23 @@ class DatasetRecord:
          namespace_id: int,
          namespace_uuid: str,
          namespace_name: str,
-         namespace_description: Optional[str],
+         namespace_description: str | None,
          namespace_created_at: datetime,
          project_id: int,
          project_uuid: str,
          project_name: str,
-         project_description: Optional[str],
+         project_description: str | None,
          project_created_at: datetime,
          project_namespace_id: int,
          dataset_id: int,
          dataset_project_id: int,
          name: str,
-         description: Optional[str],
+         description: str | None,
          attrs: str,
          status: int,
-         feature_schema: Optional[str],
+         feature_schema: str | None,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
          script_output: str,
@@ -437,19 +431,19 @@ class DatasetRecord:
          version_dataset_id: int,
          version: str,
          version_status: int,
-         version_feature_schema: Optional[str],
+         version_feature_schema: str | None,
          version_created_at: datetime,
-         version_finished_at: Optional[datetime],
+         version_finished_at: datetime | None,
          version_error_message: str,
          version_error_stack: str,
          version_script_output: str,
-         version_num_objects: Optional[int],
-         version_size: Optional[int],
-         version_preview: Optional[str],
-         version_sources: Optional[str],
-         version_query_script: Optional[str],
+         version_num_objects: int | None,
+         version_size: int | None,
+         version_preview: str | None,
+         version_sources: str | None,
+         version_query_script: str | None,
          version_schema: str,
-         version_job_id: Optional[str] = None,
+         version_job_id: str | None = None,
      ) -> "DatasetRecord":
          attrs_lst: list[str] = json.loads(attrs) if attrs else []
          schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
@@ -527,7 +521,7 @@ class DatasetRecord:
      def full_name(self) -> str:
          return f"{self.project.namespace.name}.{self.project.name}.{self.name}"

-     def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
+     def get_schema(self, version: str) -> dict[str, SQLType | type[SQLType]]:
          return self.get_version(version).schema if version else self.schema

      def update(self, **kwargs):
@@ -649,7 +643,7 @@ class DatasetRecord:
          """Returns latest version of a dataset"""
          return max(self.versions).version

-     def latest_major_version(self, major: int) -> Optional[str]:
+     def latest_major_version(self, major: int) -> str | None:
          """
          Returns latest specific major version, e.g if dataset has versions:
          - 1.4.1
@@ -664,7 +658,7 @@ class DatasetRecord:
              return None
          return max(versions).version

-     def latest_compatible_version(self, version_spec: str) -> Optional[str]:
+     def latest_compatible_version(self, version_spec: str) -> str | None:
          """
          Returns the latest version that matches the given version specifier.
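
`latest_compatible_version` pairs with the `packaging.specifiers.SpecifierSet` import visible in this file's first hunk. Matching a specifier against semver-style versions looks roughly like this (the version list is made up):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    versions = ["1.4.1", "1.4.2", "2.0.0"]
    spec = SpecifierSet(">=1.4,<2.0")

    matching = [v for v in versions if spec.contains(Version(v))]
    print(max(matching, key=Version) if matching else None)  # 1.4.2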

@@ -711,10 +705,10 @@ class DatasetListRecord:
      id: int
      name: str
      project: Project
-     description: Optional[str]
+     description: str | None
      attrs: list[str]
      versions: list[DatasetListVersion]
-     created_at: Optional[datetime] = None
+     created_at: datetime | None = None

      @classmethod
      def parse(  # noqa: PLR0913
@@ -722,17 +716,17 @@ class DatasetListRecord:
          namespace_id: int,
          namespace_uuid: str,
          namespace_name: str,
-         namespace_description: Optional[str],
+         namespace_description: str | None,
          namespace_created_at: datetime,
          project_id: int,
          project_uuid: str,
          project_name: str,
-         project_description: Optional[str],
+         project_description: str | None,
          project_created_at: datetime,
          project_namespace_id: int,
          dataset_id: int,
          name: str,
-         description: Optional[str],
+         description: str | None,
          attrs: str,
          created_at: datetime,
          version_id: int,
@@ -741,13 +735,13 @@ class DatasetListRecord:
          version: str,
          version_status: int,
          version_created_at: datetime,
-         version_finished_at: Optional[datetime],
+         version_finished_at: datetime | None,
          version_error_message: str,
          version_error_stack: str,
-         version_num_objects: Optional[int],
-         version_size: Optional[int],
-         version_query_script: Optional[str],
-         version_job_id: Optional[str] = None,
+         version_num_objects: int | None,
+         version_size: int | None,
+         version_query_script: str | None,
+         version_job_id: str | None = None,
      ) -> "DatasetListRecord":
          attrs_lst: list[str] = json.loads(attrs) if attrs else []
datachain/delta.py CHANGED
@@ -1,7 +1,7 @@
  from collections.abc import Sequence
  from copy import copy
  from functools import wraps
- from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+ from typing import TYPE_CHECKING, TypeVar

  import datachain
  from datachain.dataset import DatasetDependency, DatasetRecord
@@ -9,7 +9,10 @@ from datachain.error import DatasetNotFoundError
  from datachain.project import Project

  if TYPE_CHECKING:
-     from typing_extensions import Concatenate, ParamSpec
+     from collections.abc import Callable
+     from typing import Concatenate
+
+     from typing_extensions import ParamSpec

      from datachain.lib.dc import DataChain
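
`Concatenate` is imported from plain `typing` here because it has lived there since Python 3.10, while `ParamSpec` stays on `typing_extensions` (possibly for features beyond the 3.10 baseline); both sit under `TYPE_CHECKING` since only the type checker needs them. A self-contained sketch of the decorator shape these names describe, with hypothetical names and both imported from `typing` for simplicity:

    from collections.abc import Callable
    from functools import wraps
    from typing import Concatenate, ParamSpec, TypeVar

    P = ParamSpec("P")
    T = TypeVar("T")

    def with_session(func: Callable[Concatenate[str, P], T]) -> Callable[P, T]:
        """Supply the leading 'session' argument automatically (illustrative)."""
        @wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
            return func("default-session", *args, **kwargs)
        return wrapper

    @with_session
    def run(session: str, query: str) -> str:
        return f"{session}: {query}"

    print(run("select 1"))  # default-session: select 1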

@@ -55,8 +58,8 @@ def _get_delta_chain(
      source_ds_project: Project,
      source_ds_version: str,
      source_ds_latest_version: str,
-     on: Union[str, Sequence[str]],
-     compare: Optional[Union[str, Sequence[str]]] = None,
+     on: str | Sequence[str],
+     compare: str | Sequence[str] | None = None,
  ) -> "DataChain":
      """Get delta chain for processing changes between versions."""
      source_dc = datachain.read_dataset(
@@ -84,11 +87,11 @@ def _get_retry_chain(
      source_ds_name: str,
      source_ds_project: Project,
      source_ds_version: str,
-     on: Union[str, Sequence[str]],
-     right_on: Optional[Union[str, Sequence[str]]],
-     delta_retry: Optional[Union[bool, str]],
+     on: str | Sequence[str],
+     right_on: str | Sequence[str] | None,
+     delta_retry: bool | str | None,
      diff_chain: "DataChain",
- ) -> Optional["DataChain"]:
+ ) -> "DataChain | None":
      """Get retry chain for processing error records and missing records."""
      # Import here to avoid circular import
      from datachain.lib.dc import C
@@ -144,11 +147,11 @@ def _get_source_info(
      latest_version: str,
      catalog,
  ) -> tuple[
-     Optional[str],
-     Optional[Project],
-     Optional[str],
-     Optional[str],
-     Optional[list[DatasetDependency]],
+     str | None,
+     Project | None,
+     str | None,
+     str | None,
+     list[DatasetDependency] | None,
  ]:
      """Get source dataset information and dependencies.

@@ -190,11 +193,11 @@ def delta_retry_update(
      namespace_name: str,
      project_name: str,
      name: str,
-     on: Union[str, Sequence[str]],
-     right_on: Optional[Union[str, Sequence[str]]] = None,
-     compare: Optional[Union[str, Sequence[str]]] = None,
-     delta_retry: Optional[Union[bool, str]] = None,
- ) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
+     on: str | Sequence[str],
+     right_on: str | Sequence[str] | None = None,
+     compare: str | Sequence[str] | None = None,
+     delta_retry: bool | str | None = None,
+ ) -> tuple["DataChain | None", list[DatasetDependency] | None, bool]:
      """
      Creates new chain that consists of the last version of current delta dataset
      plus diff from the source with all needed modifications.