datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/cli/parser/utils.py CHANGED
@@ -1,5 +1,4 @@
1
1
  from argparse import Action, ArgumentParser, ArgumentTypeError, HelpFormatter
2
- from typing import Union
3
2
 
4
3
  from datachain.cli.utils import CommaSeparatedArgs
5
4
 
@@ -44,7 +43,7 @@ def parse_find_column(column: str) -> str:
44
43
  )
45
44
 
46
45
 
47
- def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Action:
46
+ def add_sources_arg(parser: ArgumentParser, nargs: str | int = "+") -> Action:
48
47
  return parser.add_argument(
49
48
  "sources",
50
49
  type=str,
datachain/cli/utils.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import logging
2
- from argparse import SUPPRESS, Action, ArgumentError, Namespace, _AppendAction
3
- from typing import Optional
2
+ from argparse import SUPPRESS, Action, Namespace, _AppendAction
4
3
 
5
4
  from datachain.error import DataChainError
6
5
 
@@ -64,18 +63,6 @@ class CommaSeparatedArgs(_AppendAction): # pylint: disable=protected-access
64
63
  setattr(namespace, self.dest, list(dict.fromkeys(items)))
65
64
 
66
65
 
67
- class KeyValueArgs(_AppendAction): # pylint: disable=protected-access
68
- def __call__(self, parser, namespace, values, option_string=None):
69
- items = getattr(namespace, self.dest) or {}
70
- for raw_value in filter(bool, values):
71
- key, sep, value = raw_value.partition("=")
72
- if not key or not sep or value == "":
73
- raise ArgumentError(self, f"expected 'key=value', got {raw_value!r}")
74
- items[key.strip()] = value
75
-
76
- setattr(namespace, self.dest, items)
77
-
78
-
79
66
  def get_logging_level(args: Namespace) -> int:
80
67
  if args.quiet:
81
68
  return logging.CRITICAL
@@ -84,7 +71,7 @@ def get_logging_level(args: Namespace) -> int:
84
71
  return logging.INFO
85
72
 
86
73
 
87
- def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
74
+ def determine_flavors(studio: bool, local: bool, all: bool, token: str | None):
88
75
  if studio and not token:
89
76
  raise DataChainError(
90
77
  "Not logged in to Studio. Log in with 'datachain auth login'."
datachain/client/azure.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Any, Optional
1
+ from typing import Any
2
2
  from urllib.parse import parse_qs, urlsplit, urlunsplit
3
3
 
4
4
  from adlfs import AzureBlobFileSystem
@@ -73,7 +73,7 @@ class AzureClient(Client):
73
73
  result_queue.put_nowait(None)
74
74
 
75
75
  @classmethod
76
- def version_path(cls, path: str, version_id: Optional[str]) -> str:
76
+ def version_path(cls, path: str, version_id: str | None) -> str:
77
77
  parts = list(urlsplit(path))
78
78
  query = parse_qs(parts[3])
79
79
  if "versionid" in query:
datachain/client/fsspec.py CHANGED
@@ -10,15 +10,7 @@ from abc import ABC, abstractmethod
10
10
  from collections.abc import AsyncIterator, Iterator, Sequence
11
11
  from datetime import datetime
12
12
  from shutil import copy2
13
- from typing import (
14
- TYPE_CHECKING,
15
- Any,
16
- BinaryIO,
17
- ClassVar,
18
- NamedTuple,
19
- Optional,
20
- Union,
21
- )
13
+ from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar, NamedTuple
22
14
  from urllib.parse import urlparse
23
15
 
24
16
  from dvc_objects.fs.system import reflink
@@ -44,11 +36,12 @@ FETCH_WORKERS = 100
44
36
  DELIMITER = "/" # Path delimiter.
45
37
 
46
38
  DATA_SOURCE_URI_PATTERN = re.compile(r"^[\w]+:\/\/.*$")
39
+ CLOUD_STORAGE_PROTOCOLS = {"s3", "gs", "az", "hf"}
47
40
 
48
- ResultQueue = asyncio.Queue[Optional[Sequence["File"]]]
41
+ ResultQueue = asyncio.Queue[Sequence["File"] | None]
49
42
 
50
43
 
51
- def _is_win_local_path(uri: str) -> bool:
44
+ def is_win_local_path(uri: str) -> bool:
52
45
  if sys.platform == "win32":
53
46
  if len(uri) >= 1 and uri[0] == "\\":
54
47
  return True
@@ -62,10 +55,20 @@ def _is_win_local_path(uri: str) -> bool:
62
55
  return False
63
56
 
64
57
 
58
+ def is_cloud_uri(uri: str) -> bool:
59
+ protocol = urlparse(uri).scheme
60
+ return protocol in CLOUD_STORAGE_PROTOCOLS
61
+
62
+
63
+ def get_cloud_schemes() -> list[str]:
64
+ """Get list of cloud storage scheme prefixes."""
65
+ return [f"{p}://" for p in CLOUD_STORAGE_PROTOCOLS]
66
+
67
+
65
68
  class Bucket(NamedTuple):
66
69
  name: str
67
70
  uri: "StorageURI"
68
- created: Optional[datetime]
71
+ created: datetime | None
69
72
 
70
73
 
71
74
  class Client(ABC):
@@ -77,21 +80,22 @@ class Client(ABC):
77
80
  def __init__(self, name: str, fs_kwargs: dict[str, Any], cache: Cache) -> None:
78
81
  self.name = name
79
82
  self.fs_kwargs = fs_kwargs
80
- self._fs: Optional[AbstractFileSystem] = None
83
+ self._fs: AbstractFileSystem | None = None
81
84
  self.cache = cache
82
85
  self.uri = self.get_uri(self.name)
83
86
 
84
87
  @staticmethod
85
- def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
88
+ def get_implementation(url: str | os.PathLike[str]) -> type["Client"]: # noqa: PLR0911
86
89
  from .azure import AzureClient
87
90
  from .gcs import GCSClient
88
91
  from .hf import HfClient
92
+ from .http import HTTPClient, HTTPSClient
89
93
  from .local import FileClient
90
94
  from .s3 import ClientS3
91
95
 
92
96
  protocol = urlparse(os.fspath(url)).scheme
93
97
 
94
- if not protocol or _is_win_local_path(os.fspath(url)):
98
+ if not protocol or is_win_local_path(os.fspath(url)):
95
99
  return FileClient
96
100
  if protocol == ClientS3.protocol:
97
101
  return ClientS3
@@ -103,9 +107,18 @@ class Client(ABC):
103
107
  return FileClient
104
108
  if protocol == HfClient.protocol:
105
109
  return HfClient
110
+ if protocol == HTTPClient.protocol:
111
+ return HTTPClient
112
+ if protocol == HTTPSClient.protocol:
113
+ return HTTPSClient
106
114
 
107
115
  raise NotImplementedError(f"Unsupported protocol: {protocol}")
108
116
 
117
+ @classmethod
118
+ def path_to_uri(cls, path: str) -> str:
119
+ """Convert a path-like object to a URI. Default: identity."""
120
+ return path
121
+
109
122
  @staticmethod
110
123
  def is_data_source_uri(name: str) -> bool:
111
124
  # Returns True if name is one of supported data sources URIs, e.g s3 bucket
@@ -118,9 +131,7 @@ class Client(ABC):
118
131
  return cls.get_uri(storage_name), rel_path
119
132
 
120
133
  @staticmethod
121
- def get_client(
122
- source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
123
- ) -> "Client":
134
+ def get_client(source: str | os.PathLike[str], cache: Cache, **kwargs) -> "Client":
124
135
  cls = Client.get_implementation(source)
125
136
  storage_url, _ = cls.split_url(os.fspath(source))
126
137
  if os.name == "nt":
@@ -136,7 +147,7 @@ class Client(ABC):
136
147
  return fs
137
148
 
138
149
  @classmethod
139
- def version_path(cls, path: str, version_id: Optional[str]) -> str:
150
+ def version_path(cls, path: str, version_id: str | None) -> str:
140
151
  return path
141
152
 
142
153
  @classmethod
@@ -216,16 +227,16 @@ class Client(ABC):
216
227
  )
217
228
  return self.info_to_file(info, file_path).etag
218
229
 
219
- def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
230
+ def get_file_info(self, path: str, version_id: str | None = None) -> "File":
220
231
  info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
221
232
  return self.info_to_file(info, path)
222
233
 
223
- async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
234
+ async def get_size(self, path: str, version_id: str | None = None) -> int:
224
235
  return await self.fs._size(
225
236
  self.version_path(path, version_id), version_id=version_id
226
237
  )
227
238
 
228
- async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
239
+ async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
229
240
  return await self.fs._get_file(
230
241
  self.version_path(lpath, version_id),
231
242
  rpath,
@@ -339,7 +350,7 @@ class Client(ABC):
339
350
  def rel_path(self, path: str) -> str:
340
351
  return self.fs.split_path(path)[1]
341
352
 
342
- def get_full_path(self, rel_path: str, version_id: Optional[str] = None) -> str:
353
+ def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
343
354
  return self.version_path(f"{self.PREFIX}{self.name}/{rel_path}", version_id)
344
355
 
345
356
  @abstractmethod
datachain/client/gcs.py CHANGED
@@ -3,7 +3,7 @@ import json
3
3
  import os
4
4
  from collections.abc import Iterable
5
5
  from datetime import datetime
6
- from typing import Any, Optional, cast
6
+ from typing import Any, cast
7
7
 
8
8
  from dateutil.parser import isoparse
9
9
  from gcsfs import GCSFileSystem
@@ -15,7 +15,7 @@ from .fsspec import DELIMITER, Client, ResultQueue
15
15
 
16
16
  # Patch gcsfs for consistency with s3fs
17
17
  GCSFileSystem.set_session = GCSFileSystem._set_session
18
- PageQueue = asyncio.Queue[Optional[Iterable[dict[str, Any]]]]
18
+ PageQueue = asyncio.Queue[Iterable[dict[str, Any]] | None]
19
19
 
20
20
 
21
21
  class GCSClient(Client):
@@ -141,5 +141,5 @@ class GCSClient(Client):
141
141
  )
142
142
 
143
143
  @classmethod
144
- def version_path(cls, path: str, version_id: Optional[str]) -> str:
144
+ def version_path(cls, path: str, version_id: str | None) -> str:
145
145
  return f"{path}#{version_id}" if version_id else path
datachain/client/http.py ADDED
@@ -0,0 +1,157 @@
1
+ from datetime import datetime, timezone
2
+ from typing import TYPE_CHECKING, Any, ClassVar, cast
3
+ from urllib.parse import urlparse
4
+
5
+ from fsspec.implementations.http import HTTPFileSystem
6
+
7
+ from datachain.dataset import StorageURI
8
+ from datachain.lib.file import File
9
+
10
+ from .fsspec import Client
11
+
12
+ if TYPE_CHECKING:
13
+ from datachain.cache import Cache
14
+
15
+
16
+ class HTTPClient(Client):
17
+ FS_CLASS = HTTPFileSystem
18
+ PREFIX: ClassVar[str] = "http://"
19
+ protocol: ClassVar[str] = "http"
20
+
21
+ @classmethod
22
+ def create_fs(cls, **kwargs) -> HTTPFileSystem:
23
+ # Configure HTTPFileSystem options
24
+ kwargs.setdefault("simple_links", True)
25
+ kwargs.setdefault("same_scheme", True)
26
+ kwargs.setdefault("cache_type", "bytes")
27
+
28
+ kwargs.pop("version_aware", None)
29
+
30
+ fs = cls.FS_CLASS(**kwargs)
31
+ fs.invalidate_cache()
32
+ return cast("HTTPFileSystem", fs)
33
+
34
+ @classmethod
35
+ def from_name(
36
+ cls,
37
+ name: str,
38
+ cache: "Cache",
39
+ kwargs: dict[str, Any],
40
+ ) -> "HTTPClient":
41
+ parsed = urlparse(name)
42
+
43
+ if parsed.scheme:
44
+ name = parsed.netloc + parsed.path
45
+
46
+ return cls(name, kwargs, cache)
47
+
48
+ @classmethod
49
+ def split_url(cls, url: str) -> tuple[str, str]:
50
+ """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
51
+ parsed = urlparse(url)
52
+ domain = parsed.netloc
53
+ path = parsed.path.lstrip("/")
54
+
55
+ if parsed.query:
56
+ path += f"?{parsed.query}"
57
+ if parsed.fragment:
58
+ path += f"#{parsed.fragment}"
59
+
60
+ return domain, path
61
+
62
+ @classmethod
63
+ def get_uri(cls, name: str) -> "StorageURI":
64
+ if not name.startswith(("http://", "https://")):
65
+ return StorageURI(f"{cls.PREFIX}{name}")
66
+ return StorageURI(name)
67
+
68
+ @classmethod
69
+ def is_root_url(cls, url: str) -> bool:
70
+ parsed = urlparse(url)
71
+ return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
72
+
73
+ def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
74
+ if self.name.startswith(("http://", "https://")):
75
+ base_url = self.name
76
+ else:
77
+ if rel_path and "/" in rel_path:
78
+ first_part = rel_path.split("/")[0]
79
+ if "." in first_part and not first_part.startswith("."):
80
+ return f"{self.protocol}://{rel_path}"
81
+
82
+ base_url = f"{self.protocol}://{self.name}"
83
+
84
+ if rel_path:
85
+ if not base_url.endswith("/") and not rel_path.startswith("/"):
86
+ base_url += "/"
87
+ full_url = base_url + rel_path
88
+ else:
89
+ full_url = base_url
90
+
91
+ return full_url
92
+
93
+ def url(self, path: str, expires: int = 3600, **kwargs) -> str:
94
+ """
95
+ Generate URL for the given path.
96
+ Note: HTTP URLs don't support signed/expiring URLs.
97
+ """
98
+ return self.get_full_path(path, kwargs.pop("version_id", None))
99
+
100
+ def info_to_file(self, v: dict[str, Any], path: str) -> File:
101
+ etag = v.get("ETag", "").strip('"')
102
+ last_modified = v.get("last_modified")
103
+ if last_modified:
104
+ if isinstance(last_modified, str):
105
+ try:
106
+ from email.utils import parsedate_to_datetime
107
+
108
+ last_modified = parsedate_to_datetime(last_modified)
109
+ except (ValueError, TypeError):
110
+ last_modified = datetime.now(timezone.utc)
111
+ elif isinstance(last_modified, (int, float)):
112
+ last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
113
+ else:
114
+ last_modified = datetime.now(timezone.utc)
115
+
116
+ return File(
117
+ source=self.uri,
118
+ path=path,
119
+ size=v.get("size", 0),
120
+ etag=etag,
121
+ version="",
122
+ is_latest=True,
123
+ last_modified=last_modified,
124
+ )
125
+
126
+ def upload(self, data: bytes, path: str) -> "File":
127
+ raise NotImplementedError(
128
+ "HTTP/HTTPS client is read-only. Upload operations are not supported."
129
+ )
130
+
131
+ def get_file_info(self, path: str, version_id: str | None = None) -> "File":
132
+ info = self.fs.info(self.get_full_path(path))
133
+ return self.info_to_file(info, path)
134
+
135
+ def open_object(self, file: "File", use_cache: bool = True, cb=None):
136
+ from datachain.client.fileslice import FileWrapper
137
+
138
+ if use_cache and (cache_path := self.cache.get_path(file)):
139
+ return open(cache_path, mode="rb")
140
+
141
+ assert not file.location
142
+ return FileWrapper(
143
+ self.fs.open(self.get_full_path(file.get_path_normalized())),
144
+ cb or (lambda x: None),
145
+ )
146
+
147
+ async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
148
+ return await self.fs._get_file(lpath, rpath, callback=callback)
149
+
150
+ async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
151
+ full_url = self.get_full_path(prefix)
152
+ raise NotImplementedError(f"Cannot download file from {full_url}")
153
+
154
+
155
+ class HTTPSClient(HTTPClient):
156
+ protocol = "https"
157
+ PREFIX = "https://"
datachain/client/local.py CHANGED
@@ -2,14 +2,14 @@ import os
2
2
  import posixpath
3
3
  from datetime import datetime, timezone
4
4
  from pathlib import Path
5
- from typing import TYPE_CHECKING, Any, Optional
5
+ from typing import TYPE_CHECKING, Any
6
6
  from urllib.parse import urlparse
7
7
 
8
8
  from fsspec.implementations.local import LocalFileSystem
9
9
 
10
10
  from datachain.lib.file import File
11
11
 
12
- from .fsspec import Client
12
+ from .fsspec import Client, is_win_local_path
13
13
 
14
14
  if TYPE_CHECKING:
15
15
  from datachain.cache import Cache
@@ -57,9 +57,13 @@ class FileClient(Client):
57
57
  /home/user/animals/ -> file:///home/user/animals/
58
58
  C:\\windows\animals -> file:///C:/windows/animals
59
59
  """
60
+ parsed = urlparse(path)
61
+ if parsed.scheme and not is_win_local_path(path):
62
+ return path
63
+
60
64
  uri = Path(path).expanduser().absolute().resolve().as_uri()
61
- if path[-1] == os.sep:
62
- # we should keep os separator from the end of the path
65
+ if path and path[-1] in (os.sep, "/"):
66
+ # keep trailing separator so directory URIs stay rooted
63
67
  uri += "/" # in uri (file:///...) all separators are / regardless of os
64
68
 
65
69
  return uri
@@ -102,10 +106,10 @@ class FileClient(Client):
102
106
  info = self.fs.info(self.get_full_path(file.get_path_normalized()))
103
107
  return self.info_to_file(info, "").etag
104
108
 
105
- async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
109
+ async def get_size(self, path: str, version_id: str | None = None) -> int:
106
110
  return self.fs.size(path)
107
111
 
108
- async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
112
+ async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
109
113
  return self.fs.get_file(lpath, rpath, callback=callback)
110
114
 
111
115
  async def ls_dir(self, path):
@@ -114,7 +118,7 @@ class FileClient(Client):
114
118
  def rel_path(self, path):
115
119
  return posixpath.relpath(path, self.name)
116
120
 
117
- def get_full_path(self, rel_path, version_id: Optional[str] = None):
121
+ def get_full_path(self, rel_path, version_id: str | None = None):
118
122
  full_path = Path(self.name, rel_path).as_posix()
119
123
  if rel_path.endswith("/") or not rel_path:
120
124
  full_path += "/"
datachain/client/s3.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import asyncio
2
2
  import os
3
- from typing import Any, Optional, cast
3
+ from typing import Any, cast
4
4
  from urllib.parse import parse_qs, urlsplit, urlunsplit
5
5
 
6
6
  from botocore.exceptions import NoCredentialsError
@@ -148,7 +148,7 @@ class ClientS3(Client):
148
148
  )
149
149
 
150
150
  @classmethod
151
- def version_path(cls, path: str, version_id: Optional[str]) -> str:
151
+ def version_path(cls, path: str, version_id: str | None) -> str:
152
152
  parts = list(urlsplit(path))
153
153
  query = parse_qs(parts[3])
154
154
  if "versionId" in query:
@@ -187,7 +187,7 @@ class ClientS3(Client):
187
187
  return subdirs
188
188
 
189
189
  @staticmethod
190
- def clean_s3_version(ver: Optional[str]) -> str:
190
+ def clean_s3_version(ver: str | None) -> str:
191
191
  return ver if (ver is not None and ver != "null") else ""
192
192
 
193
193
  def info_to_file(self, v: dict[str, Any], path: str) -> File:
datachain/config.py CHANGED
@@ -1,7 +1,6 @@
1
1
  from collections.abc import Mapping
2
2
  from contextlib import contextmanager
3
3
  from enum import Enum
4
- from typing import Optional, Union
5
4
 
6
5
  from tomlkit import TOMLDocument, dump, load
7
6
 
@@ -22,16 +21,13 @@ class Config:
22
21
  # In the order of precedence
23
22
  LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS
24
23
 
25
- def __init__(
26
- self,
27
- level: Optional[ConfigLevel] = None,
28
- ):
24
+ def __init__(self, level: ConfigLevel | None = None):
29
25
  self.level = level
30
26
 
31
27
  self.init()
32
28
 
33
29
  @classmethod
34
- def get_dir(cls, level: Optional[ConfigLevel]) -> str:
30
+ def get_dir(cls, level: ConfigLevel | None) -> str:
35
31
  if level == ConfigLevel.SYSTEM:
36
32
  return system_config_dir()
37
33
  if level == ConfigLevel.GLOBAL:
@@ -43,7 +39,7 @@ class Config:
43
39
  d = DataChainDir(self.get_dir(self.level))
44
40
  d.init()
45
41
 
46
- def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
42
+ def load_one(self, level: ConfigLevel | None = None) -> TOMLDocument:
47
43
  config_path = DataChainDir(self.get_dir(level)).config
48
44
 
49
45
  try:
@@ -128,7 +124,7 @@ class Config:
128
124
  return remote_conf
129
125
 
130
126
 
131
- def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
127
+ def merge(into: TOMLDocument | dict, update: TOMLDocument | dict):
132
128
  """Merges second dict into first recursively"""
133
129
  for key, val in update.items():
134
130
  if isinstance(into.get(key), dict) and isinstance(val, dict):
datachain/data_storage/db_engine.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from abc import ABC, abstractmethod
3
3
  from collections.abc import Iterator
4
- from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
4
+ from typing import TYPE_CHECKING, Any, ClassVar
5
5
 
6
6
  import sqlalchemy as sa
7
7
  from sqlalchemy.sql import FROM_LINTING
@@ -58,7 +58,7 @@ class DatabaseEngine(ABC, Serializable):
58
58
  @classmethod
59
59
  def compile_to_args(
60
60
  cls, statement: "ClauseElement", **kwargs
61
- ) -> Union[tuple[str], tuple[str, dict[str, Any]]]:
61
+ ) -> tuple[str] | tuple[str, dict[str, Any]]:
62
62
  """
63
63
  Compile a sqlalchemy query or ddl object to an args tuple.
64
64
 
@@ -75,8 +75,8 @@ class DatabaseEngine(ABC, Serializable):
75
75
  def execute(
76
76
  self,
77
77
  query,
78
- cursor: Optional[Any] = None,
79
- conn: Optional[Any] = None,
78
+ cursor: Any | None = None,
79
+ conn: Any | None = None,
80
80
  ) -> Iterator[tuple[Any, ...]]: ...
81
81
 
82
82
  def get_table(self, name: str) -> "Table":
@@ -90,7 +90,7 @@ class DatabaseEngine(ABC, Serializable):
90
90
 
91
91
  @abstractmethod
92
92
  def executemany(
93
- self, query, params, cursor: Optional[Any] = None
93
+ self, query, params, cursor: Any | None = None
94
94
  ) -> Iterator[tuple[Any, ...]]: ...
95
95
 
96
96
  @abstractmethod
@@ -112,7 +112,13 @@ class DatabaseEngine(ABC, Serializable):
112
112
  return sa.inspect(self.engine).has_table(name)
113
113
 
114
114
  @abstractmethod
115
- def create_table(self, table: "Table", if_not_exists: bool = True) -> None: ...
115
+ def create_table(
116
+ self,
117
+ table: "Table",
118
+ if_not_exists: bool = True,
119
+ *,
120
+ kind: str | None = None,
121
+ ) -> None: ...
116
122
 
117
123
  @abstractmethod
118
124
  def drop_table(self, table: "Table", if_exists: bool = False) -> None: ...
datachain/data_storage/job.py CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
4
4
  class JobStatus(int, Enum):
5
5
  CREATED = 1
6
6
  SCHEDULED = 10
7
+ PROVISIONING = 12
7
8
  QUEUED = 2
8
9
  INIT = 3
9
10
  RUNNING = 4
@@ -13,6 +14,7 @@ class JobStatus(int, Enum):
13
14
  CANCELED = 8
14
15
  CANCELING_SCHEDULED = 9
15
16
  TASK = 11
17
+ PENDING = 13
16
18
 
17
19
  @classmethod
18
20
  def finished(cls) -> tuple[int, ...]: