datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (105)
  1. datachain/asyn.py +11 -12
  2. datachain/cache.py +5 -5
  3. datachain/catalog/catalog.py +75 -83
  4. datachain/catalog/loader.py +3 -3
  5. datachain/checkpoint.py +1 -2
  6. datachain/cli/__init__.py +2 -4
  7. datachain/cli/commands/datasets.py +13 -13
  8. datachain/cli/commands/ls.py +4 -4
  9. datachain/cli/commands/query.py +3 -3
  10. datachain/cli/commands/show.py +2 -2
  11. datachain/cli/parser/job.py +1 -1
  12. datachain/cli/parser/utils.py +1 -2
  13. datachain/cli/utils.py +1 -2
  14. datachain/client/azure.py +2 -2
  15. datachain/client/fsspec.py +11 -21
  16. datachain/client/gcs.py +3 -3
  17. datachain/client/http.py +4 -4
  18. datachain/client/local.py +4 -4
  19. datachain/client/s3.py +3 -3
  20. datachain/config.py +4 -8
  21. datachain/data_storage/db_engine.py +5 -5
  22. datachain/data_storage/metastore.py +107 -107
  23. datachain/data_storage/schema.py +18 -24
  24. datachain/data_storage/sqlite.py +21 -28
  25. datachain/data_storage/warehouse.py +13 -13
  26. datachain/dataset.py +64 -70
  27. datachain/delta.py +21 -18
  28. datachain/diff/__init__.py +13 -13
  29. datachain/func/aggregate.py +9 -11
  30. datachain/func/array.py +12 -12
  31. datachain/func/base.py +7 -4
  32. datachain/func/conditional.py +9 -13
  33. datachain/func/func.py +45 -42
  34. datachain/func/numeric.py +5 -7
  35. datachain/func/string.py +2 -2
  36. datachain/hash_utils.py +54 -81
  37. datachain/job.py +8 -8
  38. datachain/lib/arrow.py +17 -14
  39. datachain/lib/audio.py +6 -6
  40. datachain/lib/clip.py +5 -4
  41. datachain/lib/convert/python_to_sql.py +4 -22
  42. datachain/lib/convert/values_to_tuples.py +4 -9
  43. datachain/lib/data_model.py +20 -19
  44. datachain/lib/dataset_info.py +6 -6
  45. datachain/lib/dc/csv.py +10 -10
  46. datachain/lib/dc/database.py +28 -29
  47. datachain/lib/dc/datachain.py +98 -97
  48. datachain/lib/dc/datasets.py +22 -22
  49. datachain/lib/dc/hf.py +4 -4
  50. datachain/lib/dc/json.py +9 -10
  51. datachain/lib/dc/listings.py +5 -8
  52. datachain/lib/dc/pandas.py +3 -6
  53. datachain/lib/dc/parquet.py +5 -5
  54. datachain/lib/dc/records.py +5 -5
  55. datachain/lib/dc/storage.py +12 -12
  56. datachain/lib/dc/storage_pattern.py +2 -2
  57. datachain/lib/dc/utils.py +11 -14
  58. datachain/lib/dc/values.py +3 -6
  59. datachain/lib/file.py +32 -28
  60. datachain/lib/hf.py +7 -5
  61. datachain/lib/image.py +13 -13
  62. datachain/lib/listing.py +5 -5
  63. datachain/lib/listing_info.py +1 -2
  64. datachain/lib/meta_formats.py +1 -2
  65. datachain/lib/model_store.py +3 -3
  66. datachain/lib/namespaces.py +4 -6
  67. datachain/lib/projects.py +5 -9
  68. datachain/lib/pytorch.py +10 -10
  69. datachain/lib/settings.py +23 -23
  70. datachain/lib/signal_schema.py +52 -44
  71. datachain/lib/text.py +8 -7
  72. datachain/lib/udf.py +25 -17
  73. datachain/lib/udf_signature.py +11 -11
  74. datachain/lib/video.py +3 -4
  75. datachain/lib/webdataset.py +30 -35
  76. datachain/lib/webdataset_laion.py +15 -16
  77. datachain/listing.py +4 -4
  78. datachain/model/bbox.py +3 -1
  79. datachain/namespace.py +4 -4
  80. datachain/node.py +6 -6
  81. datachain/nodes_thread_pool.py +0 -1
  82. datachain/plugins.py +1 -7
  83. datachain/project.py +4 -4
  84. datachain/query/batch.py +7 -8
  85. datachain/query/dataset.py +80 -87
  86. datachain/query/dispatch.py +7 -7
  87. datachain/query/metrics.py +3 -4
  88. datachain/query/params.py +2 -3
  89. datachain/query/schema.py +7 -6
  90. datachain/query/session.py +7 -7
  91. datachain/query/udf.py +8 -7
  92. datachain/query/utils.py +3 -5
  93. datachain/remote/studio.py +33 -39
  94. datachain/script_meta.py +12 -12
  95. datachain/sql/sqlite/base.py +6 -9
  96. datachain/studio.py +30 -30
  97. datachain/toolkit/split.py +1 -2
  98. datachain/utils.py +21 -21
  99. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
  100. datachain-0.35.0.dist-info/RECORD +173 -0
  101. datachain-0.34.6.dist-info/RECORD +0 -173
  102. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
  103. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
  104. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
  105. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py CHANGED
@@ -4,18 +4,15 @@ import os
 import os.path
 import sys
 import warnings
-from collections.abc import Iterator, Sequence
+from collections.abc import Callable, Iterator, Sequence
 from typing import (
     IO,
     TYPE_CHECKING,
     Any,
     BinaryIO,
-    Callable,
     ClassVar,
     Literal,
-    Optional,
     TypeVar,
-    Union,
     cast,
     overload,
 )
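
This hunk sets the pattern for most of the release: `Optional[X]` becomes `X | None` and `Union[X, Y]` becomes `X | Y` (PEP 604, Python 3.10+), while `Callable` now comes from `collections.abc` rather than `typing`. A minimal before/after sketch of the same migration (names are illustrative, not from this diff):

    # Before: typing-module spellings
    from typing import Callable, Optional, Union

    def load(path: str, parser: Optional[Callable[[str], dict]] = None) -> Union[dict, None]:
        return parser(path) if parser else None

    # After: PEP 604 unions, Callable imported from collections.abc
    from collections.abc import Callable

    def load(path: str, parser: Callable[[str], dict] | None = None) -> dict | None:
        return parser(path) if parser else None
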
@@ -85,19 +82,20 @@ if TYPE_CHECKING:
     import sqlite3

     import pandas as pd
+    from sqlalchemy.orm import Session as OrmSession
     from typing_extensions import ParamSpec, Self

     P = ParamSpec("P")

-    ConnectionType = Union[
-        str,
-        sqlalchemy.engine.URL,
-        sqlalchemy.engine.interfaces.Connectable,
-        sqlalchemy.engine.Engine,
-        sqlalchemy.engine.Connection,
-        "sqlalchemy.orm.Session",
-        sqlite3.Connection,
-    ]
+    ConnectionType = (
+        str
+        | sqlalchemy.engine.URL
+        | sqlalchemy.engine.interfaces.Connectable
+        | sqlalchemy.engine.Engine
+        | sqlalchemy.engine.Connection
+        | OrmSession
+        | sqlite3.Connection
+    )


 T = TypeVar("T", bound="DataChain")
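
Because `ConnectionType` is defined inside an `if TYPE_CHECKING:` block, the `|` expression is only ever evaluated by type checkers, never at runtime, and later uses quote the annotation, so the new spelling carries no runtime cost. A minimal sketch of the pattern, with hypothetical names:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        import sqlite3

        # Type checkers evaluate this; at runtime TYPE_CHECKING is False.
        ConnType = str | sqlite3.Connection

    def run(conn: "ConnType") -> None:  # quoted, so never evaluated at runtime
        print(type(conn).__name__)
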
@@ -186,7 +184,7 @@ class DataChain:
         query: DatasetQuery,
         settings: Settings,
         signal_schema: SignalSchema,
-        setup: Optional[dict] = None,
+        setup: dict | None = None,
         _sys: bool = False,
     ) -> None:
         """Don't instantiate this directly, use one of the from_XXX constructors."""
@@ -197,10 +195,10 @@ class DataChain:
         self._sys = _sys
         self._delta = False
         self._delta_unsafe = False
-        self._delta_on: Optional[Union[str, Sequence[str]]] = None
-        self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
-        self._delta_compare: Optional[Union[str, Sequence[str]]] = None
-        self._delta_retry: Optional[Union[bool, str]] = None
+        self._delta_on: str | Sequence[str] | None = None
+        self._delta_result_on: str | Sequence[str] | None = None
+        self._delta_compare: str | Sequence[str] | None = None
+        self._delta_retry: bool | str | None = None

     def __repr__(self) -> str:
         """Return a string representation of the chain."""
@@ -224,10 +222,10 @@ class DataChain:

     def _as_delta(
         self,
-        on: Optional[Union[str, Sequence[str]]] = None,
-        right_on: Optional[Union[str, Sequence[str]]] = None,
-        compare: Optional[Union[str, Sequence[str]]] = None,
-        delta_retry: Optional[Union[bool, str]] = None,
+        on: str | Sequence[str] | None = None,
+        right_on: str | Sequence[str] | None = None,
+        compare: str | Sequence[str] | None = None,
+        delta_retry: bool | str | None = None,
         delta_unsafe: bool = False,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
@@ -277,7 +275,7 @@ class DataChain:

         raise ValueError(f"Column with name {name} not found in the schema")

-    def c(self, column: Union[str, Column]) -> Column:
+    def c(self, column: str | Column) -> Column:
         """Returns Column instance attached to the current chain."""
         c = self.column(column) if isinstance(column, str) else self.column(column.name)
         c.table = self._query.table
@@ -289,17 +287,17 @@ class DataChain:
         return self._query.session

     @property
-    def name(self) -> Optional[str]:
+    def name(self) -> str | None:
         """Name of the underlying dataset, if there is one."""
         return self._query.name

     @property
-    def version(self) -> Optional[str]:
+    def version(self) -> str | None:
         """Version of the underlying dataset, if there is one."""
         return self._query.version

     @property
-    def dataset(self) -> Optional[DatasetRecord]:
+    def dataset(self) -> DatasetRecord | None:
         """Underlying dataset, if there is one."""
         if not self.name:
             return None
@@ -313,7 +311,7 @@ class DataChain:
         """Return `self.union(other)`."""
         return self.union(other)

-    def print_schema(self, file: Optional[IO] = None) -> None:
+    def print_schema(self, file: IO | None = None) -> None:
         """Print schema of the chain."""
         self._effective_signals_schema.print_tree(file=file)

@@ -324,8 +322,8 @@ class DataChain:
     def _evolve(
         self,
         *,
-        query: Optional[DatasetQuery] = None,
-        settings: Optional[Settings] = None,
+        query: DatasetQuery | None = None,
+        settings: Settings | None = None,
         signal_schema=None,
         _sys=None,
     ) -> "Self":
@@ -353,15 +351,15 @@ class DataChain:

     def settings(
         self,
-        cache: Optional[bool] = None,
-        prefetch: Optional[Union[bool, int]] = None,
-        parallel: Optional[Union[bool, int]] = None,
-        workers: Optional[int] = None,
-        namespace: Optional[str] = None,
-        project: Optional[str] = None,
-        min_task_size: Optional[int] = None,
-        batch_size: Optional[int] = None,
-        sys: Optional[bool] = None,
+        cache: bool | None = None,
+        prefetch: bool | int | None = None,
+        parallel: bool | int | None = None,
+        workers: int | None = None,
+        namespace: str | None = None,
+        project: str | None = None,
+        min_task_size: int | None = None,
+        batch_size: int | None = None,
+        sys: bool | None = None,
     ) -> "Self":
         """
         Set chain execution parameters. Returns the chain itself, allowing method
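
All of these parameters are optional execution knobs applied before the chain runs. A hedged usage sketch (the source URI and values are illustrative):

    import datachain as dc

    chain = (
        dc.read_storage("s3://my-bucket/images/")
        .settings(cache=True, parallel=4, prefetch=8)
    )
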
@@ -412,7 +410,7 @@ class DataChain:
         )
         return self._evolve(settings=settings, _sys=sys)

-    def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
+    def reset_settings(self, settings: Settings | None = None) -> "Self":
         """Reset all chain settings to default values."""
         self._settings = settings if settings else Settings()
         return self
@@ -464,8 +462,8 @@ class DataChain:
     def explode(
         self,
         col: str,
-        model_name: Optional[str] = None,
-        column: Optional[str] = None,
+        model_name: str | None = None,
+        column: str | None = None,
         schema_sample_size: int = 1,
     ) -> "DataChain":
         """Explodes a column containing JSON objects (dict or str DataChain type) into
@@ -506,7 +504,7 @@ class DataChain:

         model = dict_to_data_model(model_name, output, original_names)

-        def json_to_model(json_value: Union[str, dict]):
+        def json_to_model(json_value: str | dict):
             json_dict = (
                 json.loads(json_value) if isinstance(json_value, str) else json_value
             )
@@ -599,10 +597,10 @@ class DataChain:
     def save(  # type: ignore[override]
         self,
         name: str,
-        version: Optional[str] = None,
-        description: Optional[str] = None,
-        attrs: Optional[list[str]] = None,
-        update_version: Optional[str] = "patch",
+        version: str | None = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
+        update_version: str | None = "patch",
         **kwargs,
     ) -> "DataChain":
         """Save to a Dataset. It returns the chain itself.
@@ -666,12 +664,12 @@ class DataChain:

         return result

-    def _validate_version(self, version: Optional[str]) -> None:
+    def _validate_version(self, version: str | None) -> None:
         """Validate dataset version if provided."""
         if version is not None:
             semver.validate(version)

-    def _validate_update_version(self, update_version: Optional[str]) -> None:
+    def _validate_update_version(self, update_version: str | None) -> None:
         """Ensure update_version is one of: major, minor, patch."""
         allowed = ["major", "minor", "patch"]
         if update_version not in allowed:
@@ -693,7 +691,7 @@ class DataChain:
         name: str,
         project: Project,
         kwargs: dict,
-    ) -> tuple[Optional[Job], Optional[str], Optional["DataChain"]]:
+    ) -> tuple[Job | None, str | None, "DataChain | None"]:
         """Check if checkpoint exists and return cached dataset if possible."""
         from .datasets import read_dataset

@@ -727,11 +725,11 @@ class DataChain:
     def _handle_delta(
         self,
         name: str,
-        version: Optional[str],
+        version: str | None,
         project: Project,
         schema: dict,
         kwargs: dict,
-    ) -> Optional["DataChain"]:
+    ) -> "DataChain | None":
         """Try to save as a delta dataset.
         Returns:
             A DataChain if delta logic could handle it, otherwise None to fall back
@@ -811,8 +809,8 @@ class DataChain:

     def map(
         self,
-        func: Optional[Callable] = None,
-        params: Union[None, str, Sequence[str]] = None,
+        func: Callable | None = None,
+        params: str | Sequence[str] | None = None,
         output: OutputType = None,
         **signal_map: Any,
     ) -> "Self":
@@ -863,8 +861,8 @@ class DataChain:

     def gen(
         self,
-        func: Optional[Union[Callable, Generator]] = None,
-        params: Union[None, str, Sequence[str]] = None,
+        func: Callable | Generator | None = None,
+        params: str | Sequence[str] | None = None,
         output: OutputType = None,
         **signal_map,
     ) -> "Self":
@@ -903,9 +901,9 @@ class DataChain:
     def agg(
         self,
         /,
-        func: Optional[Callable] = None,
-        partition_by: Optional[PartitionByType] = None,
-        params: Union[None, str, Sequence[str]] = None,
+        func: Callable | None = None,
+        partition_by: PartitionByType | None = None,
+        params: str | Sequence[str] | None = None,
         output: OutputType = None,
         **signal_map: Callable,
     ) -> "Self":
@@ -1038,8 +1036,8 @@ class DataChain:

     def batch_map(
         self,
-        func: Optional[Callable] = None,
-        params: Union[None, str, Sequence[str]] = None,
+        func: Callable | None = None,
+        params: str | Sequence[str] | None = None,
         output: OutputType = None,
         batch: int = 1000,
         **signal_map,
@@ -1087,8 +1085,8 @@ class DataChain:
     def _udf_to_obj(
         self,
         target_class: type[UDFObjT],
-        func: Optional[Union[Callable, UDFObjT]],
-        params: Union[None, str, Sequence[str]],
+        func: Callable | UDFObjT | None,
+        params: str | Sequence[str] | None,
         output: OutputType,
         signal_map: dict[str, Callable],
     ) -> UDFObjT:
@@ -1180,7 +1178,7 @@ class DataChain:
     def group_by(  # noqa: C901, PLR0912
         self,
         *,
-        partition_by: Optional[Union[str, Func, Sequence[Union[str, Func]]]] = None,
+        partition_by: str | Func | Sequence[str | Func] | None = None,
         **kwargs: Func,
     ) -> "Self":
         """Group rows by specified set of signals and return new signals
@@ -1486,7 +1484,7 @@ class DataChain:
         """Convert every row to a dictionary."""

         def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
-            return dict(zip(cols, row))
+            return dict(zip(cols, row, strict=False))

         return self.results(row_factory=to_dict)

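The `strict=False` added here (and in `merge`, `subtract`, and `to_parquet` below) spells out the truncation behavior `zip` has always had: since Python 3.10, `zip` accepts a `strict` flag (PEP 618), and linters such as Ruff's B905 rule ask for it to be explicit. For example:

    list(zip([1, 2, 3], ["a", "b"]))                # [(1, 'a'), (2, 'b')] -- silent truncation
    list(zip([1, 2, 3], ["a", "b"], strict=True))   # raises ValueError on length mismatch
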
@@ -1544,7 +1542,7 @@ class DataChain:
     @overload
     def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...

-    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
+    def collect(self, *cols: str) -> Iterator[DataValue | tuple[DataValue, ...]]:  # type: ignore[overload-overlap,misc]
         """
         Deprecated. Use `to_iter` method instead.
         """
@@ -1609,8 +1607,8 @@ class DataChain:
     def merge(
         self,
         right_ds: "DataChain",
-        on: Union[MergeColType, Sequence[MergeColType]],
-        right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
+        on: MergeColType | Sequence[MergeColType],
+        right_on: MergeColType | Sequence[MergeColType] | None = None,
         inner=False,
         full=False,
         rname="right_",
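
A hedged `merge` sketch (chains and key names are illustrative); right-side columns that collide with left-side names get the `rname` prefix, "right_" by default:

    merged = users.merge(orders, on="id", right_on="user_id", inner=True)
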
@@ -1678,8 +1676,8 @@ class DataChain:

         def _resolve(
             ds: DataChain,
-            col: Union[str, Function, sqlalchemy.ColumnElement],
-            side: Union[str, None],
+            col: str | Function | sqlalchemy.ColumnElement,
+            side: str | None,
         ):
             try:
                 if isinstance(col, Function):
@@ -1692,7 +1690,7 @@ class DataChain:
         ops = [
             _resolve(self, left, "left")
             == _resolve(right_ds, right, "right" if right_on else None)
-            for left, right in zip(on, right_on or on)
+            for left, right in zip(on, right_on or on, strict=False)
         ]

         if errors:
@@ -1730,8 +1728,8 @@ class DataChain:
     def subtract(  # type: ignore[override]
         self,
         other: "DataChain",
-        on: Optional[Union[str, Sequence[str]]] = None,
-        right_on: Optional[Union[str, Sequence[str]]] = None,
+        on: str | Sequence[str] | None = None,
+        right_on: str | Sequence[str] | None = None,
     ) -> "Self":
         """Remove rows that appear in another chain.

@@ -1788,6 +1786,7 @@ class DataChain:
             zip(
                 self.signals_schema.resolve(*on).db_signals(),
                 other.signals_schema.resolve(*right_on).db_signals(),
+                strict=False,
             )  # type: ignore[arg-type]
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
@@ -1795,15 +1794,15 @@ class DataChain:
     def diff(
         self,
         other: "DataChain",
-        on: Union[str, Sequence[str]],
-        right_on: Optional[Union[str, Sequence[str]]] = None,
-        compare: Optional[Union[str, Sequence[str]]] = None,
-        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        on: str | Sequence[str],
+        right_on: str | Sequence[str] | None = None,
+        compare: str | Sequence[str] | None = None,
+        right_compare: str | Sequence[str] | None = None,
         added: bool = True,
         deleted: bool = True,
         modified: bool = True,
         same: bool = False,
-        status_col: Optional[str] = None,
+        status_col: str | None = None,
     ) -> "DataChain":
         """Calculate differences between two chains.

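A hedged `diff` sketch (key and column names are illustrative): compare two chains row-by-row, optionally recording each row's status in a dedicated column:

    changes = new_chain.diff(
        old_chain,
        on="file.path",        # row identity
        compare="file.etag",   # what counts as "modified"
        status_col="diff",     # adds an added/deleted/modified/same marker column
    )
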
@@ -1864,12 +1863,12 @@ class DataChain:
         self,
         other: "DataChain",
         on: str = "file",
-        right_on: Optional[str] = None,
+        right_on: str | None = None,
         added: bool = True,
         modified: bool = True,
         deleted: bool = False,
         same: bool = False,
-        status_col: Optional[str] = None,
+        status_col: str | None = None,
     ) -> "DataChain":
         """Calculate differences between two chains containing files.

@@ -1985,6 +1984,8 @@ class DataChain:
         headers, max_length = self._effective_signals_schema.get_headers_with_length(
             include_hidden=include_hidden
         )
+
+        columns: list[str] | pd.MultiIndex
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
         else:
@@ -2080,7 +2081,7 @@ class DataChain:
         column: str = "",
         model_name: str = "",
         source: bool = True,
-        nrows: Optional[int] = None,
+        nrows: int | None = None,
         **kwargs: Any,
     ) -> "Self":
         """Generate chain from list of tabular files.
@@ -2214,10 +2215,10 @@ class DataChain:

     def to_parquet(
         self,
-        path: Union[str, os.PathLike[str], BinaryIO],
-        partition_cols: Optional[Sequence[str]] = None,
+        path: str | os.PathLike[str] | BinaryIO,
+        partition_cols: Sequence[str] | None = None,
         chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
-        fs_kwargs: Optional[dict[str, Any]] = None,
+        fs_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ) -> None:
         """Save chain to parquet file with SignalSchema metadata.
@@ -2274,7 +2275,7 @@ class DataChain:
                 # pyarrow infers the best parquet schema from the python types of
                 # the input data.
                 table = pa.Table.from_pydict(
-                    dict(zip(column_names, chunk)),
+                    dict(zip(column_names, chunk, strict=False)),
                     schema=parquet_schema,
                 )

@@ -2312,9 +2313,9 @@ class DataChain:

     def to_csv(
         self,
-        path: Union[str, os.PathLike[str]],
+        path: str | os.PathLike[str],
         delimiter: str = ",",
-        fs_kwargs: Optional[dict[str, Any]] = None,
+        fs_kwargs: dict[str, Any] | None = None,
         **kwargs,
     ) -> None:
         """Save chain to a csv (comma-separated values) file.
@@ -2359,8 +2360,8 @@ class DataChain:

     def to_json(
         self,
-        path: Union[str, os.PathLike[str]],
-        fs_kwargs: Optional[dict[str, Any]] = None,
+        path: str | os.PathLike[str],
+        fs_kwargs: dict[str, Any] | None = None,
         include_outer_list: bool = True,
     ) -> None:
         """Save chain to a JSON file.
@@ -2420,8 +2421,8 @@ class DataChain:

     def to_jsonl(
         self,
-        path: Union[str, os.PathLike[str]],
-        fs_kwargs: Optional[dict[str, Any]] = None,
+        path: str | os.PathLike[str],
+        fs_kwargs: dict[str, Any] | None = None,
     ) -> None:
         """Save chain to a JSON lines file.

@@ -2440,9 +2441,9 @@ class DataChain:
         connection: "ConnectionType",
         *,
         batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
-        on_conflict: Optional[str] = None,
-        conflict_columns: Optional[list[str]] = None,
-        column_mapping: Optional[dict[str, Optional[str]]] = None,
+        on_conflict: str | None = None,
+        conflict_columns: list[str] | None = None,
+        column_mapping: dict[str, str | None] | None = None,
     ) -> int:
         """Save chain to a database table using a given database connection.

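Any member of the `ConnectionType` union shown earlier should be accepted here: a DSN string, a SQLAlchemy Engine/Connection/Session, or a raw sqlite3 connection. A hedged sketch (the table-name parameter precedes `connection` but sits outside this hunk, and the `on_conflict` value is assumed):

    import sqlite3

    conn = sqlite3.connect("chain.db")
    rows_written = chain.to_database("my_table", conn, on_conflict="ignore")
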
@@ -2678,13 +2679,13 @@ class DataChain:

     def to_storage(
         self,
-        output: Union[str, os.PathLike[str]],
+        output: str | os.PathLike[str],
         signal: str = "file",
         placement: FileExportPlacement = "fullpath",
         link_type: Literal["copy", "symlink"] = "copy",
-        num_threads: Optional[int] = EXPORT_FILES_MAX_THREADS,
-        anon: Optional[bool] = None,
-        client_config: Optional[dict] = None,
+        num_threads: int | None = EXPORT_FILES_MAX_THREADS,
+        anon: bool | None = None,
+        client_config: dict | None = None,
     ) -> None:
         """Export files from a specified signal to a directory. Files can be
         exported to a local or cloud directory.
datachain/lib/dc/datasets.py CHANGED
@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
+from typing import TYPE_CHECKING, get_origin, get_type_hints

 from datachain.error import (
     DatasetNotFoundError,
@@ -26,20 +26,20 @@ if TYPE_CHECKING:

 def read_dataset(
     name: str,
-    namespace: Optional[str] = None,
-    project: Optional[str] = None,
-    version: Optional[Union[str, int]] = None,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
-    delta: Optional[bool] = False,
-    delta_on: Optional[Union[str, Sequence[str]]] = (
+    namespace: str | None = None,
+    project: str | None = None,
+    version: str | int | None = None,
+    session: Session | None = None,
+    settings: dict | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
         "file.path",
         "file.etag",
         "file.version",
     ),
-    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
-    delta_compare: Optional[Union[str, Sequence[str]]] = None,
-    delta_retry: Optional[Union[bool, str]] = None,
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
     delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
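
A hedged `read_dataset` sketch (names and versions are illustrative): the delta parameters let a re-run process only rows that are new or changed relative to the last saved version, matched by the `delta_on` keys:

    import datachain as dc

    chain = dc.read_dataset("my-dataset", version="1.2.0")
    incremental = dc.read_dataset(
        "images-processed",
        delta=True,
        delta_on=("file.path", "file.etag"),
    )
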
@@ -215,13 +215,13 @@ def read_dataset(


 def datasets(
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    column: Optional[str] = None,
+    column: str | None = None,
     include_listing: bool = False,
     studio: bool = False,
-    attrs: Optional[list[str]] = None,
+    attrs: list[str] | None = None,
 ) -> "DataChain":
     """Generate chain with list of registered datasets.

@@ -298,12 +298,12 @@ def datasets(

 def delete_dataset(
     name: str,
-    namespace: Optional[str] = None,
-    project: Optional[str] = None,
-    version: Optional[str] = None,
-    force: Optional[bool] = False,
-    studio: Optional[bool] = False,
-    session: Optional[Session] = None,
+    namespace: str | None = None,
+    project: str | None = None,
+    version: str | None = None,
+    force: bool | None = False,
+    studio: bool | None = False,
+    session: Session | None = None,
     in_memory: bool = False,
 ) -> None:
     """Removes specific dataset version or all dataset versions, depending on
@@ -377,7 +377,7 @@ def delete_dataset(
 def move_dataset(
     src: str,
     dest: str,
-    session: Optional[Session] = None,
+    session: Session | None = None,
     in_memory: bool = False,
 ) -> None:
     """Moves an entire dataset between namespaces and projects.
datachain/lib/dc/hf.py CHANGED
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any

 from datachain.lib.data_model import dict_to_data_model
 from datachain.query import Session
@@ -15,10 +15,10 @@ if TYPE_CHECKING:


 def read_hf(
-    dataset: Union[str, "HFDatasetType"],
+    dataset: "HFDatasetType",
     *args: Any,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     column: str = "",
     model_name: str = "",
     limit: int = 0,
datachain/lib/dc/json.py CHANGED
@@ -1,7 +1,6 @@
 import os
-import os.path
 import re
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING

 import cloudpickle

@@ -18,15 +17,15 @@


 def read_json(
-    path: Union[str, os.PathLike[str]],
+    path: str | os.PathLike[str],
     type: FileType = "text",
-    spec: Optional[DataType] = None,
-    schema_from: Optional[str] = "auto",
-    jmespath: Optional[str] = None,
-    column: Optional[str] = "",
-    model_name: Optional[str] = None,
-    format: Optional[str] = "json",
-    nrows: Optional[int] = None,
+    spec: DataType | None = None,
+    schema_from: str | None = "auto",
+    jmespath: str | None = None,
+    column: str | None = "",
+    model_name: str | None = None,
+    format: str | None = "json",
+    nrows: int | None = None,
     **kwargs,
 ) -> "DataChain":
     """Get data from JSON. It returns the chain itself.
datachain/lib/dc/listings.py CHANGED
@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

 from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
@@ -56,7 +53,7 @@ class ReadOnlyQueryStep(QueryStep):


 def listings(
-    session: Optional[Session] = None,
+    session: Session | None = None,
     in_memory: bool = False,
     column: str = "listing",
     **kwargs,
@@ -84,10 +81,10 @@ def listings(

 def read_listing_dataset(
     name: str,
-    version: Optional[str] = None,
+    version: str | None = None,
     path: str = "",
-    session: Optional["Session"] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
 ) -> tuple["DataChain", "DatasetVersion"]:
     """Read a listing dataset and return a DataChain and listing version.