datachain 0.34.6__py3-none-any.whl → 0.34.7__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.
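
Most of the changes in this release are a typing cleanup: Optional[X] and Union[X, Y] annotations are rewritten as PEP 604 unions (X | None, X | Y), Callable is imported from collections.abc instead of typing, and one alias gains an explicit TypeAlias marker. A minimal before/after sketch of the pattern (the function below is hypothetical, not taken from datachain):

# Before: typing-module constructs (needed on Python <= 3.9)
from typing import Optional, Union

def fetch_legacy(url: str, timeout: Optional[float] = None) -> Union[bytes, None]:
    ...

# After: PEP 604 unions, native syntax on Python 3.10+
def fetch(url: str, timeout: float | None = None) -> bytes | None:
    ...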

Files changed (105)
  1. datachain/asyn.py +11 -12
  2. datachain/cache.py +5 -5
  3. datachain/catalog/catalog.py +75 -83
  4. datachain/catalog/loader.py +3 -3
  5. datachain/checkpoint.py +1 -2
  6. datachain/cli/__init__.py +2 -4
  7. datachain/cli/commands/datasets.py +13 -13
  8. datachain/cli/commands/ls.py +4 -4
  9. datachain/cli/commands/query.py +3 -3
  10. datachain/cli/commands/show.py +2 -2
  11. datachain/cli/parser/job.py +1 -1
  12. datachain/cli/parser/utils.py +1 -2
  13. datachain/cli/utils.py +1 -2
  14. datachain/client/azure.py +2 -2
  15. datachain/client/fsspec.py +11 -21
  16. datachain/client/gcs.py +3 -3
  17. datachain/client/http.py +4 -4
  18. datachain/client/local.py +4 -4
  19. datachain/client/s3.py +3 -3
  20. datachain/config.py +4 -8
  21. datachain/data_storage/db_engine.py +5 -5
  22. datachain/data_storage/metastore.py +107 -107
  23. datachain/data_storage/schema.py +18 -24
  24. datachain/data_storage/sqlite.py +21 -28
  25. datachain/data_storage/warehouse.py +13 -13
  26. datachain/dataset.py +64 -70
  27. datachain/delta.py +21 -18
  28. datachain/diff/__init__.py +13 -13
  29. datachain/func/aggregate.py +9 -11
  30. datachain/func/array.py +12 -12
  31. datachain/func/base.py +7 -4
  32. datachain/func/conditional.py +9 -13
  33. datachain/func/func.py +45 -42
  34. datachain/func/numeric.py +5 -7
  35. datachain/func/string.py +2 -2
  36. datachain/hash_utils.py +54 -81
  37. datachain/job.py +8 -8
  38. datachain/lib/arrow.py +17 -14
  39. datachain/lib/audio.py +6 -6
  40. datachain/lib/clip.py +5 -4
  41. datachain/lib/convert/python_to_sql.py +4 -22
  42. datachain/lib/convert/values_to_tuples.py +4 -9
  43. datachain/lib/data_model.py +20 -19
  44. datachain/lib/dataset_info.py +6 -6
  45. datachain/lib/dc/csv.py +10 -10
  46. datachain/lib/dc/database.py +28 -29
  47. datachain/lib/dc/datachain.py +98 -97
  48. datachain/lib/dc/datasets.py +22 -22
  49. datachain/lib/dc/hf.py +4 -4
  50. datachain/lib/dc/json.py +9 -10
  51. datachain/lib/dc/listings.py +5 -8
  52. datachain/lib/dc/pandas.py +3 -6
  53. datachain/lib/dc/parquet.py +5 -5
  54. datachain/lib/dc/records.py +5 -5
  55. datachain/lib/dc/storage.py +12 -12
  56. datachain/lib/dc/storage_pattern.py +2 -2
  57. datachain/lib/dc/utils.py +11 -14
  58. datachain/lib/dc/values.py +3 -6
  59. datachain/lib/file.py +26 -26
  60. datachain/lib/hf.py +7 -5
  61. datachain/lib/image.py +13 -13
  62. datachain/lib/listing.py +5 -5
  63. datachain/lib/listing_info.py +1 -2
  64. datachain/lib/meta_formats.py +1 -2
  65. datachain/lib/model_store.py +3 -3
  66. datachain/lib/namespaces.py +4 -6
  67. datachain/lib/projects.py +5 -9
  68. datachain/lib/pytorch.py +10 -10
  69. datachain/lib/settings.py +23 -23
  70. datachain/lib/signal_schema.py +52 -44
  71. datachain/lib/text.py +8 -7
  72. datachain/lib/udf.py +25 -17
  73. datachain/lib/udf_signature.py +11 -11
  74. datachain/lib/video.py +3 -4
  75. datachain/lib/webdataset.py +30 -35
  76. datachain/lib/webdataset_laion.py +15 -16
  77. datachain/listing.py +4 -4
  78. datachain/model/bbox.py +3 -1
  79. datachain/namespace.py +4 -4
  80. datachain/node.py +6 -6
  81. datachain/nodes_thread_pool.py +0 -1
  82. datachain/plugins.py +1 -7
  83. datachain/project.py +4 -4
  84. datachain/query/batch.py +7 -8
  85. datachain/query/dataset.py +80 -87
  86. datachain/query/dispatch.py +7 -7
  87. datachain/query/metrics.py +3 -4
  88. datachain/query/params.py +2 -3
  89. datachain/query/schema.py +7 -6
  90. datachain/query/session.py +7 -7
  91. datachain/query/udf.py +8 -7
  92. datachain/query/utils.py +3 -5
  93. datachain/remote/studio.py +33 -39
  94. datachain/script_meta.py +12 -12
  95. datachain/sql/sqlite/base.py +6 -9
  96. datachain/studio.py +30 -30
  97. datachain/toolkit/split.py +1 -2
  98. datachain/utils.py +21 -21
  99. {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/METADATA +2 -3
  100. datachain-0.34.7.dist-info/RECORD +173 -0
  101. datachain-0.34.6.dist-info/RECORD +0 -173
  102. {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/WHEEL +0 -0
  103. {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/entry_points.txt +0 -0
  104. {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/licenses/LICENSE +0 -0
  105. {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/top_level.txt +0 -0
datachain/lib/dc/pandas.py CHANGED
@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.query import Session
 
@@ -19,8 +16,8 @@ if TYPE_CHECKING:
 def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     column: str = "",
 ) -> "DataChain":
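
Given the updated read_pandas signature above, a minimal usage sketch; the conventional top-level re-export (import datachain as dc) and the pandas dependency are assumptions here:

import datachain as dc
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
# session and settings now default to None via "Session | None" / "dict | None"
chain = dc.read_pandas(df, name="people")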
datachain/lib/dc/parquet.py CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 
 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -13,14 +13,14 @@ if TYPE_CHECKING:
 
 
 def read_parquet(
-    path: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     partitioning: Any = "hive",
-    output: Optional[dict[str, DataType]] = None,
+    output: dict[str, DataType] | None = None,
     column: str = "",
     model_name: str = "",
     source: bool = True,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from parquet files.
datachain/lib/dc/records.py CHANGED
@@ -1,5 +1,5 @@
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 import sqlalchemy
 
@@ -19,11 +19,11 @@ READ_RECORDS_BATCH_SIZE = 10000
 
 
 def read_records(
-    to_insert: Optional[Union[dict, Iterable[dict]]],
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    to_insert: dict | Iterable[dict] | None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    schema: Optional[dict[str, DataType]] = None,
+    schema: dict[str, DataType] | None = None,
 ) -> "DataChain":
     """Create a DataChain from the provided records. This method can be used for
     programmatically generating a chain in contrast of reading data from storages
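
A hedged sketch of read_records based on the signature above; the top-level re-export and the plain-Python-type DataType mapping are assumptions:

import datachain as dc

# to_insert: dict | Iterable[dict] | None; schema maps column names to DataTypes
chain = dc.read_records(
    [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}],
    schema={"id": int, "name": str},
)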
datachain/lib/dc/storage.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from collections.abc import Sequence
 from functools import reduce
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 from datachain.lib.dc.storage_pattern import (
     apply_glob_filter,
@@ -19,27 +19,27 @@ if TYPE_CHECKING:
 
 
 def read_storage(
-    uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
+    uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     *,
     type: FileType = "binary",
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    recursive: Optional[bool] = True,
+    recursive: bool | None = True,
     column: str = "file",
     update: bool = False,
-    anon: Optional[bool] = None,
-    delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = (
+    anon: bool | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
        "file.path",
        "file.etag",
        "file.version",
     ),
-    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
-    delta_compare: Optional[Union[str, Sequence[str]]] = None,
-    delta_retry: Optional[Union[bool, str]] = None,
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
     delta_unsafe: bool = False,
-    client_config: Optional[dict] = None,
+    client_config: dict | None = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.
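
A minimal usage sketch of the new read_storage signature (the bucket URI is a placeholder and the top-level re-export is assumed):

import datachain as dc

chain = dc.read_storage(
    "s3://example-bucket/images/",  # str | os.PathLike | list of either
    type="binary",
    recursive=True,  # recursive: bool | None = True
    anon=True,       # anon: bool | None = None; anonymous public-bucket access
)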
datachain/lib/dc/storage_pattern.py CHANGED
@@ -1,5 +1,5 @@
 import glob
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING
 
 from datachain.client.fsspec import is_cloud_uri
 from datachain.lib.listing import ls
@@ -32,7 +32,7 @@ def validate_cloud_bucket_name(uri: str) -> None:
         raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
 
 
-def split_uri_pattern(uri: str) -> tuple[str, Union[str, None]]:
+def split_uri_pattern(uri: str) -> tuple[str, str | None]:
     """Split a URI into base path and glob pattern."""
     if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
         return uri, None
datachain/lib/dc/utils.py CHANGED
@@ -1,12 +1,6 @@
 from collections.abc import Sequence
 from functools import wraps
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import TYPE_CHECKING, TypeVar
 
 import sqlalchemy
 from sqlalchemy.sql.functions import GenericFunction
@@ -18,7 +12,10 @@ from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.utils import getenv_bool
 
 if TYPE_CHECKING:
-    from typing_extensions import Concatenate, ParamSpec
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
 
     from .datachain import DataChain
 
@@ -70,11 +67,11 @@ class DatasetFromValuesError(DataChainParamsError):
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
-MergeColType = Union[str, Function, sqlalchemy.ColumnElement]
+MergeColType = str | Function | sqlalchemy.ColumnElement
 
 
 def _validate_merge_on(
-    on: Union[MergeColType, Sequence[MergeColType]],
+    on: MergeColType | Sequence[MergeColType],
     ds: "DataChain",
 ) -> Sequence[MergeColType]:
     if isinstance(on, (str, sqlalchemy.ColumnElement)):
@@ -103,12 +100,12 @@ def _get_merge_error_str(col: MergeColType) -> str:
 class DatasetMergeError(DataChainParamsError):
     def __init__(
         self,
-        on: Union[MergeColType, Sequence[MergeColType]],
-        right_on: Optional[Union[MergeColType, Sequence[MergeColType]]],
+        on: MergeColType | Sequence[MergeColType],
+        right_on: MergeColType | Sequence[MergeColType] | None,
         msg: str,
     ):
        def _get_str(
-            on: Union[MergeColType, Sequence[MergeColType]],
+            on: MergeColType | Sequence[MergeColType],
        ) -> str:
            if not isinstance(on, Sequence):
                return str(on)  # type: ignore[unreachable]
@@ -123,7 +120,7 @@ class DatasetMergeError(DataChainParamsError):
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
 
 
-OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
+OutputType = DataType | Sequence[str] | dict[str, DataType] | None
 
 
 class Sys(DataModel):
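
Note that MergeColType and OutputType are module-level aliases, so these | unions are evaluated at import time rather than lazily under TYPE_CHECKING, which requires Python 3.10+. A self-contained sketch of that runtime behavior (plain Python, no datachain imports; MergeCol is a stand-in for the aliases above):

import types

MergeCol = str | int  # evaluated eagerly at import time, like MergeColType above
assert isinstance(MergeCol, types.UnionType)  # PEP 604 unions are runtime objects
assert isinstance("name", MergeCol)           # and they work with isinstance()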
datachain/lib/dc/values.py CHANGED
@@ -1,8 +1,5 @@
 from collections.abc import Iterator
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
@@ -20,8 +17,8 @@ if TYPE_CHECKING:
 
 def read_values(
     ds_name: str = "",
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     output: OutputType = None,
     column: str = "",
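
A hedged sketch of read_values; passing per-column values as keyword arguments is an assumption based on the values_to_tuples import above, since this hunk cuts off before the trailing parameters:

import datachain as dc

# Column values as keywords; output=None lets the column types be inferred
chain = dc.read_values(id=[1, 2, 3], name=["a", "b", "c"])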
datachain/lib/file.py CHANGED
@@ -13,7 +13,7 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 
@@ -53,12 +53,12 @@ class FileExporter(NodesThreadPool):
 
     def __init__(
         self,
-        output: Union[str, os.PathLike[str]],
+        output: str | os.PathLike[str],
         placement: ExportPlacement,
        use_cache: bool,
        link_type: Literal["copy", "symlink"],
        max_threads: int = EXPORT_FILES_MAX_THREADS,
-        client_config: Optional[dict] = None,
+        client_config: dict | None = None,
     ):
         super().__init__(max_threads)
         self.output = output
@@ -221,7 +221,7 @@ class File(DataModel):
     etag: str = Field(default="")
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
-    location: Optional[Union[dict, list[dict]]] = Field(default=None)
+    location: dict | list[dict] | None = Field(default=None)
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -264,8 +264,8 @@ class File(DataModel):
 
     @staticmethod
     def _validate_dict(
-        v: Optional[Union[str, dict, list[dict]]],
-    ) -> Optional[Union[str, dict, list[dict]]]:
+        v: str | dict | list[dict] | None,
+    ) -> str | dict | list[dict] | None:
         if v is None or v == "":
             return None
         if isinstance(v, str):
@@ -334,8 +334,8 @@ class File(DataModel):
     def upload(
         cls,
         data: bytes,
-        path: Union[str, os.PathLike[str]],
-        catalog: Optional["Catalog"] = None,
+        path: str | os.PathLike[str],
+        catalog: "Catalog | None" = None,
     ) -> "Self":
         if catalog is None:
             from datachain.catalog.loader import get_catalog
@@ -357,7 +357,7 @@ class File(DataModel):
 
     @classmethod
     def at(
-        cls, uri: Union[str, os.PathLike[str]], session: Optional["Session"] = None
+        cls, uri: str | os.PathLike[str], session: "Session | None" = None
     ) -> "Self":
         """Construct a File from a full URI in one call.
 
@@ -470,7 +470,7 @@ class File(DataModel):
         """Returns file contents."""
         return self.read_bytes(length)
 
-    def save(self, destination: str, client_config: Optional[dict] = None):
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
@@ -497,11 +497,11 @@ class File(DataModel):
 
     def export(
         self,
-        output: Union[str, os.PathLike[str]],
+        output: str | os.PathLike[str],
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
-        client_config: Optional[dict] = None,
+        client_config: dict | None = None,
     ) -> None:
         """Export file to new location."""
         self._caching_enabled = use_cache
@@ -537,7 +537,7 @@ class File(DataModel):
         client = self._catalog.get_client(self.source)
         client.download(self, callback=self._download_cb)
 
-    async def _prefetch(self, download_cb: Optional["Callback"] = None) -> bool:
+    async def _prefetch(self, download_cb: "Callback | None" = None) -> bool:
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
 
@@ -552,7 +552,7 @@ class File(DataModel):
         )
         return True
 
-    def get_local_path(self) -> Optional[str]:
+    def get_local_path(self) -> str | None:
         """Return path to a file in a local cache.
 
         Returns None if file is not cached.
@@ -629,7 +629,7 @@ class File(DataModel):
         return path
 
     def get_destination_path(
-        self, output: Union[str, os.PathLike[str]], placement: ExportPlacement
+        self, output: str | os.PathLike[str], placement: ExportPlacement
     ) -> str:
         """
         Returns full destination path of a file for exporting to some output
@@ -796,7 +796,7 @@ class TextFile(File):
         with self.open(**open_kwargs) as stream:
             return stream.read()
 
-    def save(self, destination: str, client_config: Optional[dict] = None):
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
@@ -829,8 +829,8 @@ class ImageFile(File):
     def save(  # type: ignore[override]
         self,
         destination: str,
-        format: Optional[str] = None,
-        client_config: Optional[dict] = None,
+        format: str | None = None,
+        client_config: dict | None = None,
     ):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
@@ -912,7 +912,7 @@ class VideoFile(File):
     def get_frames(
         self,
         start: int = 0,
-        end: Optional[int] = None,
+        end: int | None = None,
         step: int = 1,
     ) -> "Iterator[VideoFrame]":
         """
@@ -962,7 +962,7 @@ class VideoFile(File):
         self,
         duration: float,
         start: float = 0,
-        end: Optional[float] = None,
+        end: float | None = None,
     ) -> "Iterator[VideoFragment]":
         """
         Splits the video into multiple fragments of a specified duration.
@@ -1048,7 +1048,7 @@ class AudioFile(File):
         self,
         duration: float,
         start: float = 0,
-        end: Optional[float] = None,
+        end: float | None = None,
     ) -> "Iterator[AudioFragment]":
         """
         Splits the audio into multiple fragments of a specified duration.
@@ -1086,10 +1086,10 @@ class AudioFile(File):
     def save(  # type: ignore[override]
         self,
         output: str,
-        format: Optional[str] = None,
+        format: str | None = None,
         start: float = 0,
-        end: Optional[float] = None,
-        client_config: Optional[dict] = None,
+        end: float | None = None,
+        client_config: dict | None = None,
     ) -> "AudioFile":
         """Save audio file or extract fragment to specified format.
 
@@ -1160,7 +1160,7 @@ class AudioFragment(DataModel):
         duration = self.end - self.start
         return audio_to_bytes(self.audio, format, self.start, duration)
 
-    def save(self, output: str, format: Optional[str] = None) -> "AudioFile":
+    def save(self, output: str, format: str | None = None) -> "AudioFile":
         """
         Saves the audio fragment as a new audio file.
 
@@ -1263,7 +1263,7 @@ class VideoFragment(DataModel):
     start: float
     end: float
 
-    def save(self, output: str, format: Optional[str] = None) -> "VideoFile":
+    def save(self, output: str, format: str | None = None) -> "VideoFile":
         """
         Saves the video fragment as a new video file.
 
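Based on the File.upload and File.at signatures in this diff, a hedged usage sketch (local placeholder paths; both methods fall back to a default catalog/session when none is passed):

from datachain.lib.file import File

# upload() writes raw bytes to a path; catalog: "Catalog | None" defaults to None
f = File.upload(b"hello", "/tmp/example.txt")

# at() constructs a File from a full URI in one call (per its docstring above)
g = File.at("/tmp/example.txt")
print(g.get_local_path())  # str | None: None when the file is not in the cache
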
datachain/lib/hf.py CHANGED
@@ -26,7 +26,7 @@ except ImportError as exc:
     ) from exc
 
 from io import BytesIO
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import PIL
 from tqdm.auto import tqdm
@@ -41,7 +41,9 @@ if TYPE_CHECKING:
     from pydantic import BaseModel
 
 
-HFDatasetType = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]
+HFDatasetType: TypeAlias = (
+    str | DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+)
 
 
 class HFClassLabel(DataModel):
@@ -67,7 +69,7 @@ class HFAudio(DataModel):
 class HFGenerator(Generator):
     def __init__(
         self,
-        ds: Union[str, HFDatasetType],
+        ds: HFDatasetType,
         output_schema: type["BaseModel"],
         limit: int = 0,
         *args,
@@ -117,7 +119,7 @@ class HFGenerator(Generator):
             pbar.update(1)
 
 
-def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
+def stream_splits(ds: HFDatasetType, *args, **kwargs):
     if isinstance(ds, str):
         ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
@@ -153,7 +155,7 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
 
 
 def get_output_schema(
-    features: Features, existing_column_names: Optional[list[str]] = None
+    features: Features, existing_column_names: list[str] | None = None
 ) -> tuple[dict[str, DataType], dict[str, str]]:
     """
     Generate UDF output schema from Hugging Face datasets features. It normalizes the
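
With str folded into the HFDatasetType alias, stream_splits (shown above) accepts a dataset name directly and loads it via load_dataset before splitting. A hedged sketch, assuming the datasets dependency is installed and using a public dataset id as a placeholder:

from datachain.lib.hf import stream_splits

# A plain string is a valid HFDatasetType member after this change;
# extra positional/keyword args are forwarded to load_dataset()
splits = stream_splits("imdb")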
datachain/lib/image.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 from PIL import Image as PILImage
@@ -6,7 +6,7 @@ from PIL import Image as PILImage
 from datachain.lib.file import File, FileError, Image, ImageFile
 
 
-def image_info(file: Union[File, ImageFile]) -> Image:
+def image_info(file: File | ImageFile) -> Image:
     """
     Returns image file information.
 
@@ -31,11 +31,11 @@ def image_info(file: Union[File, ImageFile]) -> Image:
 def convert_image(
     img: PILImage.Image,
     mode: str = "RGB",
-    size: Optional[tuple[int, int]] = None,
-    transform: Optional[Callable] = None,
-    encoder: Optional[Callable] = None,
-    device: Optional[Union[str, torch.device]] = None,
-) -> Union[PILImage.Image, torch.Tensor]:
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> PILImage.Image | torch.Tensor:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -71,13 +71,13 @@ def convert_image(
 
 
 def convert_images(
-    images: Union[PILImage.Image, list[PILImage.Image]],
+    images: PILImage.Image | list[PILImage.Image],
     mode: str = "RGB",
-    size: Optional[tuple[int, int]] = None,
-    transform: Optional[Callable] = None,
-    encoder: Optional[Callable] = None,
-    device: Optional[Union[str, torch.device]] = None,
-) -> Union[list[PILImage.Image], torch.Tensor]:
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> list[PILImage.Image] | torch.Tensor:
     """
     Resize, transform, and otherwise convert one or more images.
 
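A hedged sketch of the updated convert_image signature (the input image is synthesized so the snippet stays self-contained; getting a plain PIL image back when no transform/encoder is given is an assumption drawn from the return annotation):

from PIL import Image as PILImage

from datachain.lib.image import convert_image

img = PILImage.new("RGB", (64, 64))
out = convert_image(img, mode="RGB", size=(32, 32))  # size: tuple[int, int] | None
print(out.size)  # expected (32, 32) if a PIL image comes back
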
datachain/lib/listing.py CHANGED
@@ -2,10 +2,10 @@ import glob
 import logging
 import os
 import posixpath
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from contextlib import contextmanager
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, TypeVar
 
 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
@@ -73,7 +73,7 @@ def get_file_info(uri: str, cache, client_config=None) -> File:
 def ls(
     dc: D,
     path: str,
-    recursive: Optional[bool] = True,
+    recursive: bool | None = True,
     column="file",
 ) -> D:
     """
@@ -150,8 +150,8 @@ def _reraise_as_client_error() -> Iterator[None]:
 
 
 def get_listing(
-    uri: Union[str, os.PathLike[str]], session: "Session", update: bool = False
-) -> tuple[Optional[str], str, str, bool]:
+    uri: str | os.PathLike[str], session: "Session", update: bool = False
+) -> tuple[str | None, str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
datachain/lib/listing_info.py CHANGED
@@ -1,5 +1,4 @@
 from datetime import datetime, timedelta, timezone
-from typing import Optional
 
 from datachain.client import Client
 from datachain.lib.dataset_info import DatasetInfo
@@ -17,7 +16,7 @@ class ListingInfo(DatasetInfo):
         return uri
 
     @property
-    def expires(self) -> Optional[datetime]:
+    def expires(self) -> datetime | None:
         if not self.finished_at:
             return None
         return self.finished_at + timedelta(seconds=LISTING_TTL)
datachain/lib/meta_formats.py CHANGED
@@ -2,9 +2,8 @@ import csv
 import json
 import tempfile
 import uuid
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from pathlib import Path
-from typing import Callable
 
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
datachain/lib/model_store.py CHANGED
@@ -1,6 +1,6 @@
 import inspect
 import logging
-from typing import Any, ClassVar, Optional
+from typing import Any, ClassVar
 
 from pydantic import BaseModel
 
@@ -39,7 +39,7 @@ class ModelStore:
             cls.register(anno)
 
     @classmethod
-    def get(cls, name: str, version: Optional[int] = None) -> Optional[type]:
+    def get(cls, name: str, version: int | None = None) -> type | None:
         class_dict = cls.store.get(name, None)
         if class_dict is None:
             return None
@@ -77,7 +77,7 @@ class ModelStore:
         )
 
     @staticmethod
-    def to_pydantic(val) -> Optional[type[BaseModel]]:
+    def to_pydantic(val) -> type[BaseModel] | None:
         if val is None or not ModelStore.is_pydantic(val):
             return None
         return val
datachain/lib/namespaces.py CHANGED
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from datachain.error import (
     NamespaceCreateNotAllowedError,
     NamespaceDeleteNotAllowedError,
@@ -10,7 +8,7 @@ from datachain.query import Session
 
 
 def create(
-    name: str, descr: Optional[str] = None, session: Optional[Session] = None
+    name: str, descr: str | None = None, session: Session | None = None
 ) -> Namespace:
     """
     Creates a new namespace.
@@ -42,7 +40,7 @@ def create(
     return session.catalog.metastore.create_namespace(name, descr)
 
 
-def get(name: str, session: Optional[Session] = None) -> Namespace:
+def get(name: str, session: Session | None = None) -> Namespace:
     """
     Gets a namespace by name.
     If the namespace is not found, a `NamespaceNotFoundError` is raised.
@@ -61,7 +59,7 @@ def get(name: str, session: Optional[Session] = None) -> Namespace:
     return session.catalog.metastore.get_namespace(name)
 
 
-def ls(session: Optional[Session] = None) -> list[Namespace]:
+def ls(session: Session | None = None) -> list[Namespace]:
     """
     Gets a list of all namespaces.
 
@@ -77,7 +75,7 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
     return Session.get(session).catalog.metastore.list_namespaces()
 
 
-def delete_namespace(name: str, session: Optional[Session] = None) -> None:
+def delete_namespace(name: str, session: Session | None = None) -> None:
     """
     Removes a namespace by name.
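
A hedged sketch of the namespaces helpers under their new signatures; the namespace name is a placeholder, a default session is created when session=None, and note the NamespaceCreateNotAllowedError import above, so create() may be disallowed in some deployments:

from datachain.lib import namespaces

ns = namespaces.create("research", descr="example namespace")  # descr: str | None
print(namespaces.get("research"))  # session: Session | None = None
for item in namespaces.ls():       # lists all namespaces
    print(item)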