datachain 0.34.5__py3-none-any.whl → 0.34.7__py3-none-any.whl

This diff shows the changes between two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (105)
  1. datachain/asyn.py +11 -12
  2. datachain/cache.py +5 -5
  3. datachain/catalog/catalog.py +75 -83
  4. datachain/catalog/loader.py +3 -3
  5. datachain/checkpoint.py +1 -2
  6. datachain/cli/__init__.py +2 -4
  7. datachain/cli/commands/datasets.py +13 -13
  8. datachain/cli/commands/ls.py +4 -4
  9. datachain/cli/commands/query.py +3 -3
  10. datachain/cli/commands/show.py +2 -2
  11. datachain/cli/parser/job.py +1 -1
  12. datachain/cli/parser/utils.py +1 -2
  13. datachain/cli/utils.py +1 -2
  14. datachain/client/azure.py +2 -2
  15. datachain/client/fsspec.py +11 -21
  16. datachain/client/gcs.py +3 -3
  17. datachain/client/http.py +4 -4
  18. datachain/client/local.py +4 -4
  19. datachain/client/s3.py +3 -3
  20. datachain/config.py +4 -8
  21. datachain/data_storage/db_engine.py +5 -5
  22. datachain/data_storage/metastore.py +107 -107
  23. datachain/data_storage/schema.py +18 -24
  24. datachain/data_storage/sqlite.py +21 -28
  25. datachain/data_storage/warehouse.py +13 -13
  26. datachain/dataset.py +64 -70
  27. datachain/delta.py +21 -18
  28. datachain/diff/__init__.py +13 -13
  29. datachain/func/aggregate.py +9 -11
  30. datachain/func/array.py +12 -12
  31. datachain/func/base.py +7 -4
  32. datachain/func/conditional.py +9 -13
  33. datachain/func/func.py +45 -42
  34. datachain/func/numeric.py +5 -7
  35. datachain/func/string.py +2 -2
  36. datachain/hash_utils.py +54 -81
  37. datachain/job.py +8 -8
  38. datachain/lib/arrow.py +17 -14
  39. datachain/lib/audio.py +6 -6
  40. datachain/lib/clip.py +5 -4
  41. datachain/lib/convert/python_to_sql.py +4 -22
  42. datachain/lib/convert/values_to_tuples.py +4 -9
  43. datachain/lib/data_model.py +20 -19
  44. datachain/lib/dataset_info.py +6 -6
  45. datachain/lib/dc/csv.py +10 -10
  46. datachain/lib/dc/database.py +28 -29
  47. datachain/lib/dc/datachain.py +98 -97
  48. datachain/lib/dc/datasets.py +22 -22
  49. datachain/lib/dc/hf.py +4 -4
  50. datachain/lib/dc/json.py +9 -10
  51. datachain/lib/dc/listings.py +5 -8
  52. datachain/lib/dc/pandas.py +3 -6
  53. datachain/lib/dc/parquet.py +5 -5
  54. datachain/lib/dc/records.py +5 -5
  55. datachain/lib/dc/storage.py +12 -12
  56. datachain/lib/dc/storage_pattern.py +2 -2
  57. datachain/lib/dc/utils.py +11 -14
  58. datachain/lib/dc/values.py +3 -6
  59. datachain/lib/file.py +26 -26
  60. datachain/lib/hf.py +7 -5
  61. datachain/lib/image.py +13 -13
  62. datachain/lib/listing.py +5 -5
  63. datachain/lib/listing_info.py +1 -2
  64. datachain/lib/meta_formats.py +1 -2
  65. datachain/lib/model_store.py +3 -3
  66. datachain/lib/namespaces.py +4 -6
  67. datachain/lib/projects.py +5 -9
  68. datachain/lib/pytorch.py +10 -10
  69. datachain/lib/settings.py +23 -23
  70. datachain/lib/signal_schema.py +52 -44
  71. datachain/lib/text.py +8 -7
  72. datachain/lib/udf.py +25 -17
  73. datachain/lib/udf_signature.py +11 -11
  74. datachain/lib/video.py +3 -4
  75. datachain/lib/webdataset.py +30 -35
  76. datachain/lib/webdataset_laion.py +15 -16
  77. datachain/listing.py +4 -4
  78. datachain/model/bbox.py +3 -1
  79. datachain/namespace.py +4 -4
  80. datachain/node.py +6 -6
  81. datachain/nodes_thread_pool.py +0 -1
  82. datachain/plugins.py +1 -7
  83. datachain/project.py +4 -4
  84. datachain/query/batch.py +7 -8
  85. datachain/query/dataset.py +80 -87
  86. datachain/query/dispatch.py +7 -7
  87. datachain/query/metrics.py +3 -4
  88. datachain/query/params.py +2 -3
  89. datachain/query/schema.py +7 -6
  90. datachain/query/session.py +7 -7
  91. datachain/query/udf.py +8 -7
  92. datachain/query/utils.py +8 -6
  93. datachain/remote/studio.py +33 -39
  94. datachain/script_meta.py +12 -12
  95. datachain/sql/sqlite/base.py +6 -9
  96. datachain/studio.py +30 -30
  97. datachain/toolkit/split.py +1 -2
  98. datachain/utils.py +21 -21
  99. {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/METADATA +2 -3
  100. datachain-0.34.7.dist-info/RECORD +173 -0
  101. datachain-0.34.5.dist-info/RECORD +0 -173
  102. {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/WHEEL +0 -0
  103. {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/entry_points.txt +0 -0
  104. {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/licenses/LICENSE +0 -0
  105. {datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/top_level.txt +0 -0
datachain/remote/studio.py CHANGED
@@ -4,13 +4,7 @@ import os
 from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
-from typing import (
-    Any,
-    BinaryIO,
-    Generic,
-    Optional,
-    TypeVar,
-)
+from typing import Any, BinaryIO, Generic, TypeVar
 from urllib.parse import urlparse, urlunparse
 
 import websockets
@@ -22,14 +16,14 @@ from datachain.error import DataChainError
 from datachain.utils import STUDIO_URL, retry_with_backoff
 
 T = TypeVar("T")
-LsData = Optional[list[dict[str, Any]]]
-DatasetInfoData = Optional[dict[str, Any]]
-DatasetRowsData = Optional[Iterable[dict[str, Any]]]
-DatasetJobVersionsData = Optional[dict[str, Any]]
-DatasetExportStatus = Optional[dict[str, Any]]
-DatasetExportSignedUrls = Optional[list[str]]
-FileUploadData = Optional[dict[str, Any]]
-JobData = Optional[dict[str, Any]]
+LsData = list[dict[str, Any]] | None
+DatasetInfoData = dict[str, Any] | None
+DatasetRowsData = Iterable[dict[str, Any]] | None
+DatasetJobVersionsData = dict[str, Any] | None
+DatasetExportStatus = dict[str, Any] | None
+DatasetExportSignedUrls = list[str] | None
+FileUploadData = dict[str, Any] | None
+JobData = dict[str, Any] | None
 JobListData = list[dict[str, Any]]
 ClusterListData = list[dict[str, Any]]
 
@@ -93,7 +87,7 @@ class Response(Generic[T]):
 
 
 class StudioClient:
-    def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
+    def __init__(self, timeout: float = 3600.0, team: str | None = None) -> None:
         self._check_dependencies()
         self.timeout = timeout
         self._config = None
@@ -154,7 +148,7 @@ class StudioClient:
             ) from None
 
     def _send_request_msgpack(
-        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+        self, route: str, data: dict[str, Any], method: str | None = "POST"
     ) -> Response[Any]:
         import msgpack
         import requests
@@ -192,7 +186,7 @@ class StudioClient:
 
     @retry_with_backoff(retries=3, errors=(HTTPError, Timeout))
     def _send_request(
-        self, route: str, data: dict[str, Any], method: Optional[str] = "POST"
+        self, route: str, data: dict[str, Any], method: str | None = "POST"
     ) -> Response[Any]:
         """
         Function that communicate Studio API.
@@ -241,7 +235,7 @@ class StudioClient:
         return Response(data, ok, message, response.status_code)
 
     def _send_multipart_request(
-        self, route: str, files: dict[str, Any], params: Optional[dict[str, Any]] = None
+        self, route: str, files: dict[str, Any], params: dict[str, Any] | None = None
    ) -> Response[Any]:
         """
         Function that communicates with Studio API using multipart/form-data.
@@ -345,7 +339,7 @@ class StudioClient:
             response = self._send_request_msgpack("datachain/ls", {"source": path})
             yield path, response
 
-    def ls_datasets(self, prefix: Optional[str] = None) -> Response[LsData]:
+    def ls_datasets(self, prefix: str | None = None) -> Response[LsData]:
         return self._send_request(
             "datachain/datasets", {"prefix": prefix}, method="GET"
         )
@@ -355,9 +349,9 @@ class StudioClient:
         name: str,
         namespace: str,
         project: str,
-        new_name: Optional[str] = None,
-        description: Optional[str] = None,
-        attrs: Optional[list[str]] = None,
+        new_name: str | None = None,
+        description: str | None = None,
+        attrs: list[str] | None = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
@@ -378,8 +372,8 @@ class StudioClient:
         name: str,
         namespace: str,
         project: str,
-        version: Optional[str] = None,
-        force: Optional[bool] = False,
+        version: str | None = None,
+        force: bool | None = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
             "datachain/datasets",
@@ -461,18 +455,18 @@ class StudioClient:
         self,
         query: str,
         query_type: str,
-        environment: Optional[str] = None,
-        workers: Optional[int] = None,
-        query_name: Optional[str] = None,
-        files: Optional[list[str]] = None,
-        python_version: Optional[str] = None,
-        requirements: Optional[str] = None,
-        repository: Optional[str] = None,
-        priority: Optional[int] = None,
-        cluster: Optional[str] = None,
-        start_time: Optional[str] = None,
-        cron: Optional[str] = None,
-        credentials_name: Optional[str] = None,
+        environment: str | None = None,
+        workers: int | None = None,
+        query_name: str | None = None,
+        files: list[str] | None = None,
+        python_version: str | None = None,
+        requirements: str | None = None,
+        repository: str | None = None,
+        priority: int | None = None,
+        cluster: str | None = None,
+        start_time: str | None = None,
+        cron: str | None = None,
+        credentials_name: str | None = None,
     ) -> Response[JobData]:
         data = {
             "query": query,
@@ -494,9 +488,9 @@ class StudioClient:
 
     def get_jobs(
         self,
-        status: Optional[str] = None,
+        status: str | None = None,
         limit: int = 20,
-        job_id: Optional[str] = None,
+        job_id: str | None = None,
     ) -> Response[JobListData]:
         params: dict[str, Any] = {"limit": limit}
         if status is not None:
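Note: nearly all of the churn in this file (and in the files below) is the same mechanical migration from typing.Optional/typing.Union to PEP 604 union syntax (X | None, X | Y), which is only valid on Python 3.10+. A minimal sketch of the equivalence, reusing one alias from the hunk above (plain Python, no datachain imports):

from typing import Any, Optional

# On CPython 3.10+ the two spellings build types that compare equal,
# so the rewrite above is purely syntactic at runtime.
LsDataOld = Optional[list[dict[str, Any]]]  # pre-3.10 spelling (removed)
LsDataNew = list[dict[str, Any]] | None     # PEP 604 spelling (added)

assert LsDataOld == LsDataNew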
datachain/script_meta.py CHANGED
@@ -1,6 +1,6 @@
 import re
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Any
 
 try:
     import tomllib
@@ -59,23 +59,23 @@ class ScriptConfig:
 
     """
 
-    python_version: Optional[str]
+    python_version: str | None
     dependencies: list[str]
     attachments: dict[str, str]
     params: dict[str, Any]
     inputs: dict[str, Any]
     outputs: dict[str, Any]
-    num_workers: Optional[int] = None
+    num_workers: int | None = None
 
     def __init__(
         self,
-        python_version: Optional[str] = None,
-        dependencies: Optional[list[str]] = None,
-        attachments: Optional[dict[str, str]] = None,
-        params: Optional[dict[str, Any]] = None,
-        inputs: Optional[dict[str, Any]] = None,
-        outputs: Optional[dict[str, Any]] = None,
-        num_workers: Optional[int] = None,
+        python_version: str | None = None,
+        dependencies: list[str] | None = None,
+        attachments: dict[str, str] | None = None,
+        params: dict[str, Any] | None = None,
+        inputs: dict[str, Any] | None = None,
+        outputs: dict[str, Any] | None = None,
+        num_workers: int | None = None,
     ):
         self.python_version = python_version
         self.dependencies = dependencies or []
@@ -98,7 +98,7 @@ class ScriptConfig:
         return self.attachments.get(name, default)
 
     @staticmethod
-    def read(script: str) -> Optional[dict]:
+    def read(script: str) -> dict | None:
         """Converts inline script metadata to dict with all found data"""
         regex = (
             r"(?m)^# \/\/\/ (?P<type>[a-zA-Z0-9-]+)[ \t]*$[\r\n|\r|\n]"
@@ -119,7 +119,7 @@ class ScriptConfig:
         return None
 
     @staticmethod
-    def parse(script: str) -> Optional["ScriptConfig"]:
+    def parse(script: str) -> "ScriptConfig | None":
         """
         Method that is parsing inline script metadata from datachain script and
         instantiating ScriptConfig class with found data. If no inline metadata is
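For orientation: ScriptConfig.read extracts PEP 723-style inline metadata blocks via the regex above, and ScriptConfig.parse turns the result into a ScriptConfig, or None when no block exists (hence the new "ScriptConfig | None" return type). A hypothetical block of the shape that regex matches; the field names are illustrative assumptions, not taken from this diff:

# /// script
# python_version = "3.11"
# dependencies = ["pandas"]
# ///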
datachain/sql/sqlite/base.py CHANGED
@@ -2,11 +2,10 @@ import logging
 import re
 import sqlite3
 import warnings
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from datetime import MAXYEAR, MINYEAR, datetime, timezone
 from functools import cache
 from types import MappingProxyType
-from typing import Callable, Optional
 
 import sqlalchemy as sa
 import ujson as json
@@ -132,7 +131,7 @@ def run_compiler_hook(name):
 
 
 def functions_exist(
-    names: Iterable[str], connection: Optional[sqlite3.Connection] = None
+    names: Iterable[str], connection: sqlite3.Connection | None = None
 ) -> bool:
     """
     Returns True if all function names are defined for the given connection.
@@ -201,9 +200,7 @@ def sqlite_int_hash_64(x: int) -> int:
 def sqlite_bit_hamming_distance(a: int, b: int) -> int:
     """Calculate the Hamming distance between two integers."""
     diff = (a & MAX_INT64) ^ (b & MAX_INT64)
-    if hasattr(diff, "bit_count"):
-        return diff.bit_count()
-    return bin(diff).count("1")
+    return diff.bit_count()
 
 
 def sqlite_byte_hamming_distance(a: str, b: str) -> int:
@@ -215,7 +212,7 @@ def sqlite_byte_hamming_distance(a: str, b: str) -> int:
     elif len(b) < len(a):
         diff = len(a) - len(b)
         a = a[: len(b)]
-    return diff + sum(c1 != c2 for c1, c2 in zip(a, b))
+    return diff + sum(c1 != c2 for c1, c2 in zip(a, b, strict=False))
 
 
 def register_user_defined_sql_functions() -> None:
@@ -470,7 +467,7 @@ def py_json_array_get_element(val, idx):
         return None
 
 
-def py_json_array_slice(val, offset: int, length: Optional[int] = None):
+def py_json_array_slice(val, offset: int, length: int | None = None):
     arr = json.loads(val)
     try:
         return json.dumps(
@@ -605,7 +602,7 @@ def compile_collect(element, compiler, **kwargs):
 
 
 @cache
-def usearch_sqlite_path() -> Optional[str]:
+def usearch_sqlite_path() -> str | None:
     try:
         import usearch
     except ImportError:
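The sqlite_bit_hamming_distance simplification is unlocked by the same version bump: int.bit_count() exists on every Python >=3.10, so the hasattr fallback to bin(diff).count("1") became dead code once 3.9 support was dropped. A small sketch of the identity being relied on:

# popcount of XOR = number of bit positions where the operands differ
def bit_hamming_distance(a: int, b: int) -> int:
    return (a ^ b).bit_count()

assert bit_hamming_distance(0b1010, 0b0110) == 2
assert (0b1011_0110).bit_count() == bin(0b1011_0110).count("1") == 5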
datachain/studio.py CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import os
 import sys
 from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import dateparser
 import tabulate
@@ -175,7 +175,7 @@ def token():
     print(token)
 
 
-def list_datasets(team: Optional[str] = None, name: Optional[str] = None):
+def list_datasets(team: str | None = None, name: str | None = None):
     def ds_full_name(ds: dict) -> str:
         return (
             f"{ds['project']['namespace']['name']}.{ds['project']['name']}.{ds['name']}"
@@ -206,7 +206,7 @@ def list_datasets(team: Optional[str] = None, name: Optional[str] = None):
         yield (full_name, version)
 
 
-def list_dataset_versions(team: Optional[str] = None, name: str = ""):
+def list_dataset_versions(team: str | None = None, name: str = ""):
     client = StudioClient(team=team)
 
     namespace_name, project_name, name = parse_dataset_name(name)
@@ -226,13 +226,13 @@ def list_dataset_versions(team: Optional[str] = None, name: str = ""):
 
 
 def edit_studio_dataset(
-    team_name: Optional[str],
+    team_name: str | None,
     name: str,
     namespace: str,
     project: str,
-    new_name: Optional[str] = None,
-    description: Optional[str] = None,
-    attrs: Optional[list[str]] = None,
+    new_name: str | None = None,
+    description: str | None = None,
+    attrs: list[str] | None = None,
 ):
     client = StudioClient(team=team_name)
     response = client.edit_dataset(
@@ -245,12 +245,12 @@ def edit_studio_dataset(
 
 
 def remove_studio_dataset(
-    team_name: Optional[str],
+    team_name: str | None,
     name: str,
     namespace: str,
     project: str,
-    version: Optional[str] = None,
-    force: Optional[bool] = False,
+    version: str | None = None,
+    force: bool | None = False,
 ):
     client = StudioClient(team=team_name)
     response = client.rm_dataset(name, namespace, project, version, force)
@@ -271,7 +271,7 @@ def save_config(hostname, token, level=ConfigLevel.GLOBAL):
     return config.config_file()
 
 
-def parse_start_time(start_time_str: Optional[str]) -> Optional[str]:
+def parse_start_time(start_time_str: str | None) -> str | None:
     if not start_time_str:
         return None
 
@@ -343,21 +343,21 @@ def show_logs_from_client(client, job_id):
 
 def create_job(
     query_file: str,
-    team_name: Optional[str],
-    env_file: Optional[str] = None,
-    env: Optional[list[str]] = None,
-    workers: Optional[int] = None,
-    files: Optional[list[str]] = None,
-    python_version: Optional[str] = None,
-    repository: Optional[str] = None,
-    req: Optional[list[str]] = None,
-    req_file: Optional[str] = None,
-    priority: Optional[int] = None,
-    cluster: Optional[str] = None,
-    start_time: Optional[str] = None,
-    cron: Optional[str] = None,
-    no_wait: Optional[bool] = False,
-    credentials_name: Optional[str] = None,
+    team_name: str | None,
+    env_file: str | None = None,
+    env: list[str] | None = None,
+    workers: int | None = None,
+    files: list[str] | None = None,
+    python_version: str | None = None,
+    repository: str | None = None,
+    req: list[str] | None = None,
+    req_file: str | None = None,
+    priority: int | None = None,
+    cluster: str | None = None,
+    start_time: str | None = None,
+    cron: str | None = None,
+    no_wait: bool | None = False,
+    credentials_name: str | None = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:
@@ -433,7 +433,7 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
     return file_ids
 
 
-def cancel_job(job_id: str, team_name: Optional[str]):
+def cancel_job(job_id: str, team_name: str | None):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
@@ -448,7 +448,7 @@ def cancel_job(job_id: str, team_name: Optional[str]):
     print(f"Job {job_id} canceled")
 
 
-def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
+def list_jobs(status: str | None, team_name: str | None, limit: int):
     client = StudioClient(team=team_name)
     response = client.get_jobs(status, limit)
     if not response.ok:
@@ -473,7 +473,7 @@ def list_jobs(status: Optional[str], team_name: Optional[str], limit: int):
     print(tabulate.tabulate(rows, headers="keys", tablefmt="grid"))
 
 
-def show_job_logs(job_id: str, team_name: Optional[str]):
+def show_job_logs(job_id: str, team_name: str | None):
     token = Config().read().get("studio", {}).get("token")
     if not token:
         raise DataChainError(
@@ -484,7 +484,7 @@ def show_job_logs(job_id: str, team_name: Optional[str]):
     return show_logs_from_client(client, job_id)
 
 
-def list_clusters(team_name: Optional[str]):
+def list_clusters(team_name: str | None):
     client = StudioClient(team=team_name)
     response = client.get_clusters()
     if not response.ok:
datachain/toolkit/split.py CHANGED
@@ -1,5 +1,4 @@
 import random
-from typing import Optional
 
 from datachain import C, DataChain
 
@@ -9,7 +8,7 @@ RESOLUTION = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
 def train_test_split(
     dc: DataChain,
     weights: list[float],
-    seed: Optional[int] = None,
+    seed: int | None = None,
 ) -> list[DataChain]:
     """
     Splits a DataChain into multiple subsets based on the provided weights.
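A hedged usage sketch for the function this hunk touches; only train_test_split's signature is confirmed by the diff, and the chain constructor below is an assumption:

import datachain as dc

from datachain.toolkit.split import train_test_split

# read_values is an assumed constructor for demo data; the confirmed part is
# train_test_split(chain, weights, seed), with seed now typed `int | None`.
chain = dc.read_values(num=list(range(100)))
train, test = train_test_split(chain, [0.8, 0.2], seed=42)  # reproducible 80/20 split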
datachain/utils.py CHANGED
@@ -11,7 +11,7 @@ import time
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import contextmanager
 from datetime import date, datetime, timezone
-from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Any, TypeVar
 from uuid import UUID
 
 import cloudpickle
@@ -53,11 +53,11 @@ class DataChainDir:
 
     def __init__(
         self,
-        root: Optional[str] = None,
-        cache: Optional[str] = None,
-        tmp: Optional[str] = None,
-        db: Optional[str] = None,
-        config: Optional[str] = None,
+        root: str | None = None,
+        cache: str | None = None,
+        tmp: str | None = None,
+        db: str | None = None,
+        config: str | None = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -122,7 +122,7 @@ def global_config_dir():
     )
 
 
-def human_time_to_int(time: str) -> Optional[int]:
+def human_time_to_int(time: str) -> int | None:
     if not time:
         return None
 
@@ -146,7 +146,7 @@ def time_to_str(dt):
     return dt.strftime("%Y-%m-%d %H:%M:%S")
 
 
-def time_to_local(dt: Union[datetime, str]) -> datetime:
+def time_to_local(dt: datetime | str) -> datetime:
     # TODO check usage
     if isinstance(dt, str):
         dt = isoparse(dt)
@@ -156,11 +156,11 @@ def time_to_local(dt: Union[datetime, str]) -> datetime:
     return dt
 
 
-def time_to_local_str(dt: Union[datetime, str]) -> str:
+def time_to_local_str(dt: datetime | str) -> str:
     return time_to_str(time_to_local(dt))
 
 
-def is_expired(expires: Optional[Union[datetime, str]]):
+def is_expired(expires: datetime | str | None):
     if expires:
         return time_to_local(expires) < time_to_local(datetime.now())  # noqa: DTZ005
 
@@ -301,9 +301,9 @@ def retry_with_backoff(retries=5, backoff_sec=1, errors=(Exception,)):
 
 
 def determine_workers(
-    workers: Union[bool, int],
-    rows_total: Optional[int] = None,
-) -> Union[bool, int]:
+    workers: bool | int,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of workers to use for distributed processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable distributed processing if there is no rows or only one row.
@@ -322,9 +322,9 @@ def determine_workers(
 
 
 def determine_processes(
-    parallel: Optional[Union[bool, int]] = None,
-    rows_total: Optional[int] = None,
-) -> Union[bool, int]:
+    parallel: bool | int | None = None,
+    rows_total: int | None = None,
+) -> bool | int:
     """Determine the number of processes to use for parallel processing."""
     if rows_total is not None and rows_total <= 1:
         # Disable parallel processing if there is no rows or only one row.
@@ -344,8 +344,8 @@ def determine_processes(
 
 
 def get_env_list(
-    key: str, default: Optional[Sequence] = None, sep: str = ","
-) -> Optional[Sequence[str]]:
+    key: str, default: Sequence | None = None, sep: str = ","
+) -> Sequence[str] | None:
     try:
         str_val = os.environ[key]
     except KeyError:
@@ -386,10 +386,10 @@ def show_df(
 
 
 def show_records(
-    records: Optional[list[dict]],
+    records: list[dict] | None,
     collapse_columns: bool = False,
     system_columns: bool = False,
-    hidden_fields: Optional[list[str]] = None,
+    hidden_fields: list[str] | None = None,
 ) -> None:
     import pandas as pd
 
@@ -518,7 +518,7 @@ def row_to_nested_dict(
 ) -> dict[str, Any]:
     """Converts a row to a nested dict based on the provided headers."""
     result: dict[str, Any] = {}
-    for h, v in zip(headers, row):
+    for h, v in zip(headers, row, strict=False):
         nested_dict_path_set(result, h, v)
     return result
 
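Both zip() call sites touched in this release (here and in sql/sqlite/base.py) now spell out strict=False. The keyword was added in Python 3.10; passing it explicitly keeps the historical stop-at-shortest behavior and presumably satisfies a lint rule such as ruff's B905. A sketch of the two modes:

headers = ["a", "b", "c"]
row = (1, 2)  # deliberately shorter

# strict=False: the default behavior, stop at the shortest iterable
assert list(zip(headers, row, strict=False)) == [("a", 1), ("b", 2)]

# strict=True would raise ValueError on the length mismatch instead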
{datachain-0.34.5.dist-info → datachain-0.34.7.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.34.5
+Version: 0.34.7
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -8,13 +8,12 @@ Project-URL: Documentation, https://datachain.dvc.ai
 Project-URL: Issues, https://github.com/iterative/datachain/issues
 Project-URL: Source, https://github.com/iterative/datachain
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Development Status :: 2 - Pre-Alpha
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/x-rst
 License-File: LICENSE
 Requires-Dist: pyyaml
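The metadata hunk above is what licenses everything else in this diff: the Python 3.9 classifier is gone and Requires-Python moves to >=3.10, matching the PEP 604 unions, int.bit_count(), and zip(strict=...) introduced in the code. A hypothetical runtime guard expressing the same constraint (illustrative, not from the diff):

import sys

# Mirrors the wheel's Requires-Python: >=3.10
if sys.version_info < (3, 10):
    raise RuntimeError("datachain 0.34.7 requires Python 3.10 or newer")

In practice the guard is unnecessary: pip on a Python 3.9 interpreter will skip 0.34.7 and resolve to the newest release whose Requires-Python still allows 3.9.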