datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/client/gcs.py CHANGED
@@ -3,7 +3,7 @@ import json
 import os
 from collections.abc import Iterable
 from datetime import datetime
-from typing import Any, Optional, cast
+from typing import Any, cast
 
 from dateutil.parser import isoparse
 from gcsfs import GCSFileSystem
@@ -15,7 +15,7 @@ from .fsspec import DELIMITER, Client, ResultQueue
 
 # Patch gcsfs for consistency with s3fs
 GCSFileSystem.set_session = GCSFileSystem._set_session
-PageQueue = asyncio.Queue[Optional[Iterable[dict[str, Any]]]]
+PageQueue = asyncio.Queue[Iterable[dict[str, Any]] | None]
 
 
 class GCSClient(Client):
@@ -74,7 +74,7 @@ class GCSClient(Client):
         try:
             await self._get_pages(prefix, page_queue)
             found = await consumer
-            if not found:
+            if not found and prefix:
                 raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
         finally:
             consumer.cancel()  # In case _get_pages() raised
@@ -115,7 +115,7 @@ class GCSClient(Client):
                 maxResults=page_size,
                 pageToken=next_page_token,
                 json_out=True,
-                versions="true",
+                versions="true" if self._is_version_aware() else "false",
             )
             assert page["kind"] == "storage#objects"
             await page_queue.put(page.get("items", []))
@@ -134,12 +134,12 @@ class GCSClient(Client):
             source=self.uri,
             path=path,
             etag=v.get("etag", ""),
-            version=v.get("generation", ""),
+            version=v.get("generation", "") if self._is_version_aware() else "",
            is_latest=not v.get("timeDeleted"),
            last_modified=self.parse_timestamp(v["updated"]),
            size=v.get("size", ""),
        )

    @classmethod
-   def version_path(cls, path: str, version_id: Optional[str]) -> str:
+   def version_path(cls, path: str, version_id: str | None) -> str:
        return f"{path}#{version_id}" if version_id else path
datachain/client/hf.py CHANGED
@@ -15,6 +15,34 @@ class classproperty: # noqa: N801
         return self.fget(owner)
 
 
+def _wrap_class(sync_fs_class):
+    """
+    Analog of `AsyncFileSystemWrapper.wrap_class` from fsspec, but sets
+    asynchronous to False by default, matching the other async filesystems
+    we initialize. Among other things, this means we don't break in
+    Jupyter, where user code runs in an async context.
+
+    This also fixes write operations by ensuring they are properly forwarded
+    to the underlying filesystem without async buffering issues.
+    """
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+
+    class GeneratedAsyncFileSystemWrapper(AsyncFileSystemWrapper):
+        def __init__(self, *args, **kwargs):
+            sync_fs = sync_fs_class(*args, **kwargs)
+            super().__init__(sync_fs, asynchronous=False)
+
+        def open(self, path, mode="rb", **kwargs):
+            # Override open() to ensure write operations work correctly.
+            # This looks like a bug in the fsspec wrapper: it avoids
+            # wrapping open() explicitly but also doesn't redirect it to
+            # the sync filesystem.
+            return self.sync_fs.open(path, mode, **kwargs)
+
+    GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
+    return GeneratedAsyncFileSystemWrapper
+
+
 @functools.cache
 def get_hf_filesystem_cls():
     import fsspec
@@ -29,10 +57,9 @@ def get_hf_filesystem_cls():
         f"{fsspec_version} is installed."
     )
 
-    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
     from huggingface_hub import HfFileSystem
 
-    fs_cls = AsyncFileSystemWrapper.wrap_class(HfFileSystem)
+    fs_cls = _wrap_class(HfFileSystem)
     # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
     fs_cls.protocol = HfFileSystem.protocol
     return fs_cls
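
A minimal sketch of exercising _wrap_class, using fsspec's in-memory filesystem as a hypothetical stand-in for HfFileSystem (assumes an fsspec version that ships fsspec.implementations.asyn_wrapper):

    from fsspec.implementations.memory import MemoryFileSystem

    # Hypothetical: wrap a sync filesystem the way the diff wraps HfFileSystem.
    AsyncMemoryWrapper = _wrap_class(MemoryFileSystem)
    fs = AsyncMemoryWrapper()  # synchronous construction (asynchronous=False)

    # open() is forwarded to the wrapped sync filesystem, so writes work:
    with fs.open("/demo.txt", "wb") as f:
        f.write(b"hello")
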
datachain/client/http.py ADDED
@@ -0,0 +1,157 @@
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, ClassVar, cast
+from urllib.parse import urlparse
+
+from fsspec.implementations.http import HTTPFileSystem
+
+from datachain.dataset import StorageURI
+from datachain.lib.file import File
+
+from .fsspec import Client
+
+if TYPE_CHECKING:
+    from datachain.cache import Cache
+
+
+class HTTPClient(Client):
+    FS_CLASS = HTTPFileSystem
+    PREFIX: ClassVar[str] = "http://"
+    protocol: ClassVar[str] = "http"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HTTPFileSystem:
+        # Configure HTTPFileSystem options
+        kwargs.setdefault("simple_links", True)
+        kwargs.setdefault("same_scheme", True)
+        kwargs.setdefault("cache_type", "bytes")
+
+        kwargs.pop("version_aware", None)
+
+        fs = cls.FS_CLASS(**kwargs)
+        fs.invalidate_cache()
+        return cast("HTTPFileSystem", fs)
+
+    @classmethod
+    def from_name(
+        cls,
+        name: str,
+        cache: "Cache",
+        kwargs: dict[str, Any],
+    ) -> "HTTPClient":
+        parsed = urlparse(name)
+
+        if parsed.scheme:
+            name = parsed.netloc + parsed.path
+
+        return cls(name, kwargs, cache)
+
+    @classmethod
+    def split_url(cls, url: str) -> tuple[str, str]:
+        """Split HTTP/HTTPS URL into domain (bucket equivalent) and path."""
+        parsed = urlparse(url)
+        domain = parsed.netloc
+        path = parsed.path.lstrip("/")
+
+        if parsed.query:
+            path += f"?{parsed.query}"
+        if parsed.fragment:
+            path += f"#{parsed.fragment}"
+
+        return domain, path
+
+    @classmethod
+    def get_uri(cls, name: str) -> "StorageURI":
+        if not name.startswith(("http://", "https://")):
+            return StorageURI(f"{cls.PREFIX}{name}")
+        return StorageURI(name)
+
+    @classmethod
+    def is_root_url(cls, url: str) -> bool:
+        parsed = urlparse(url)
+        return parsed.path in ("", "/") and not parsed.query and not parsed.fragment
+
+    def get_full_path(self, rel_path: str, version_id: str | None = None) -> str:
+        if self.name.startswith(("http://", "https://")):
+            base_url = self.name
+        else:
+            if rel_path and "/" in rel_path:
+                first_part = rel_path.split("/")[0]
+                if "." in first_part and not first_part.startswith("."):
+                    return f"{self.protocol}://{rel_path}"
+
+            base_url = f"{self.protocol}://{self.name}"
+
+        if rel_path:
+            if not base_url.endswith("/") and not rel_path.startswith("/"):
+                base_url += "/"
+            full_url = base_url + rel_path
+        else:
+            full_url = base_url
+
+        return full_url
+
+    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
+        """
+        Generate URL for the given path.
+        Note: HTTP URLs don't support signed/expiring URLs.
+        """
+        return self.get_full_path(path, kwargs.pop("version_id", None))
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        etag = v.get("ETag", "").strip('"')
+        last_modified = v.get("last_modified")
+        if last_modified:
+            if isinstance(last_modified, str):
+                try:
+                    from email.utils import parsedate_to_datetime
+
+                    last_modified = parsedate_to_datetime(last_modified)
+                except (ValueError, TypeError):
+                    last_modified = datetime.now(timezone.utc)
+            elif isinstance(last_modified, (int, float)):
+                last_modified = datetime.fromtimestamp(last_modified, timezone.utc)
+        else:
+            last_modified = datetime.now(timezone.utc)
+
+        return File(
+            source=self.uri,
+            path=path,
+            size=v.get("size", 0),
+            etag=etag,
+            version="",
+            is_latest=True,
+            last_modified=last_modified,
+        )
+
+    def upload(self, data: bytes, path: str) -> "File":
+        raise NotImplementedError(
+            "HTTP/HTTPS client is read-only. Upload operations are not supported."
+        )
+
+    def get_file_info(self, path: str, version_id: str | None = None) -> "File":
+        info = self.fs.info(self.get_full_path(path))
+        return self.info_to_file(info, path)
+
+    def open_object(self, file: "File", use_cache: bool = True, cb=None):
+        from datachain.client.fileslice import FileWrapper
+
+        if use_cache and (cache_path := self.cache.get_path(file)):
+            return open(cache_path, mode="rb")
+
+        assert not file.location
+        return FileWrapper(
+            self.fs.open(self.get_full_path(file.get_path_normalized())),
+            cb or (lambda x: None),
+        )
+
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
+        return await self.fs._get_file(lpath, rpath, callback=callback)
+
+    async def _fetch_dir(self, prefix: str, pbar, result_queue) -> set[str]:
+        full_url = self.get_full_path(prefix)
+        raise NotImplementedError(f"Cannot download file from {full_url}")
+
+
+class HTTPSClient(HTTPClient):
+    protocol = "https"
+    PREFIX = "https://"
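
A few hedged examples of the new client's URL handling, derived directly from the methods above rather than from official docs:

    domain, path = HTTPClient.split_url("https://example.com/data/file.csv?v=1")
    # domain == "example.com", path == "data/file.csv?v=1"

    HTTPClient.is_root_url("https://example.com/")       # True
    HTTPClient.is_root_url("https://example.com/a?b=1")  # False
    HTTPSClient.get_uri("example.com/data")              # "https://example.com/data"
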
datachain/client/local.py CHANGED
@@ -2,14 +2,14 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
 
 from datachain.lib.file import File
 
-from .fsspec import Client
+from .fsspec import Client, is_win_local_path
 
 if TYPE_CHECKING:
     from datachain.cache import Cache
@@ -57,9 +57,13 @@ class FileClient(Client):
         /home/user/animals/ -> file:///home/user/animals/
         C:\\windows\animals -> file:///C:/windows/animals
         """
+        parsed = urlparse(path)
+        if parsed.scheme and not is_win_local_path(path):
+            return path
+
         uri = Path(path).expanduser().absolute().resolve().as_uri()
-        if path[-1] == os.sep:
-            # we should keep os separator from the end of the path
+        if path and path[-1] in (os.sep, "/"):
+            # keep trailing separator so directory URIs stay rooted
             uri += "/"  # in uri (file:///...) all separators are / regardless of os
 
         return uri
@@ -99,13 +103,13 @@ class FileClient(Client):
         )
 
     async def get_current_etag(self, file: "File") -> str:
-        info = self.fs.info(self.get_full_path(file.path))
+        info = self.fs.info(self.get_full_path(file.get_path_normalized()))
         return self.info_to_file(info, "").etag
 
-    async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
+    async def get_size(self, path: str, version_id: str | None = None) -> int:
         return self.fs.size(path)
 
-    async def get_file(self, lpath, rpath, callback, version_id: Optional[str] = None):
+    async def get_file(self, lpath, rpath, callback, version_id: str | None = None):
         return self.fs.get_file(lpath, rpath, callback=callback)
 
     async def ls_dir(self, path):
@@ -114,7 +118,7 @@ class FileClient(Client):
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)
 
-    def get_full_path(self, rel_path, version_id: Optional[str] = None):
+    def get_full_path(self, rel_path, version_id: str | None = None):
         full_path = Path(self.name, rel_path).as_posix()
         if rel_path.endswith("/") or not rel_path:
             full_path += "/"
@@ -138,8 +142,8 @@ class FileClient(Client):
         if not self.use_symlinks:
             super().fetch_nodes(nodes, shared_progress_bar)
 
-    def do_instantiate_object(self, uid, dst):
+    def do_instantiate_object(self, file: File, dst: str) -> None:
         if self.use_symlinks:
-            os.symlink(Path(self.name, uid.path), dst)
+            os.symlink(Path(self.name, file.path), dst)
         else:
-            super().do_instantiate_object(uid, dst)
+            super().do_instantiate_object(file, dst)
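
The practical effect of the new early return in path_to_uri, as a sketch (assuming it is callable as shown; POSIX paths for brevity):

    FileClient.path_to_uri("/home/user/animals/")  # "file:///home/user/animals/"
    FileClient.path_to_uri("file:///tmp/data")     # already a URI: returned as-is
    # Windows drive paths such as C:\windows\animals are still converted,
    # because is_win_local_path() excludes them from the early return.
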
datachain/client/s3.py CHANGED
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Optional, cast
+from typing import Any, cast
 from urllib.parse import parse_qs, urlsplit, urlunsplit
 
 from botocore.exceptions import NoCredentialsError
@@ -80,7 +80,7 @@ class ClientS3(Client):
            finally:
                await page_queue.put(None)
 
-        async def process_pages(page_queue, result_queue):
+        async def process_pages(page_queue, result_queue, prefix):
            found = False
            with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
                while (res := await page_queue.get()) is not None:
@@ -94,14 +94,14 @@ class ClientS3(Client):
                    if entries:
                        await result_queue.put(entries)
                        pbar.update(len(entries))
-           if not found:
+           if not found and prefix:
                raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
 
        try:
            prefix = start_prefix
            if prefix:
                prefix = prefix.lstrip(DELIMITER) + DELIMITER
-           versions = True
+           versions = self._is_version_aware()
            fs = self.fs
            await fs.set_session()
            s3 = await fs.get_s3(self.name)
@@ -118,7 +118,9 @@ class ClientS3(Client):
                Delimiter="",
            )
            page_queue: asyncio.Queue[list] = asyncio.Queue(2)
-           consumer = asyncio.create_task(process_pages(page_queue, result_queue))
+           consumer = asyncio.create_task(
+               process_pages(page_queue, result_queue, prefix)
+           )
            try:
                await get_pages(it, page_queue)
                await consumer
@@ -137,14 +139,16 @@ class ClientS3(Client):
            source=self.uri,
            path=v["Key"],
            etag=v.get("ETag", "").strip('"'),
-           version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+           version=(
+               ClientS3.clean_s3_version(v.get("VersionId", "")) if versions else ""
+           ),
            is_latest=v.get("IsLatest", True),
            last_modified=v.get("LastModified", ""),
            size=v["Size"],
        )
 
    @classmethod
-   def version_path(cls, path: str, version_id: Optional[str]) -> str:
+   def version_path(cls, path: str, version_id: str | None) -> str:
        parts = list(urlsplit(path))
        query = parse_qs(parts[3])
        if "versionId" in query:
@@ -183,7 +187,7 @@ class ClientS3(Client):
        return subdirs
 
    @staticmethod
-   def clean_s3_version(ver: Optional[str]) -> str:
+   def clean_s3_version(ver: str | None) -> str:
        return ver if (ver is not None and ver != "null") else ""
 
    def info_to_file(self, v: dict[str, Any], path: str) -> File:
@@ -191,7 +195,11 @@ class ClientS3(Client):
            source=self.uri,
            path=path,
            size=v["size"],
-           version=ClientS3.clean_s3_version(v.get("VersionId", "")),
+           version=(
+               ClientS3.clean_s3_version(v.get("VersionId", ""))
+               if self._is_version_aware()
+               else ""
+           ),
            etag=v.get("ETag", "").strip('"'),
            is_latest=v.get("IsLatest", True),
            last_modified=v.get("LastModified", ""),
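
A quick illustration of the version handling above (derived from the hunks, not official docs): clean_s3_version() normalizes the "null" VersionId that unversioned buckets report, and with version awareness disabled, info_to_file() now records an empty version even when a VersionId is present.

    ClientS3.clean_s3_version("null")      # ""
    ClientS3.clean_s3_version(None)        # ""
    ClientS3.clean_s3_version("3HL4kqt")   # "3HL4kqt"
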
datachain/config.py CHANGED
@@ -1,7 +1,6 @@
 from collections.abc import Mapping
 from contextlib import contextmanager
 from enum import Enum
-from typing import Optional, Union
 
 from tomlkit import TOMLDocument, dump, load
@@ -22,16 +21,13 @@ class Config:
     # In the order of precedence
     LEVELS = SYSTEM_LEVELS + LOCAL_LEVELS
 
-    def __init__(
-        self,
-        level: Optional[ConfigLevel] = None,
-    ):
+    def __init__(self, level: ConfigLevel | None = None):
         self.level = level
 
         self.init()
 
     @classmethod
-    def get_dir(cls, level: Optional[ConfigLevel]) -> str:
+    def get_dir(cls, level: ConfigLevel | None) -> str:
         if level == ConfigLevel.SYSTEM:
             return system_config_dir()
         if level == ConfigLevel.GLOBAL:
@@ -43,7 +39,7 @@ class Config:
         d = DataChainDir(self.get_dir(self.level))
         d.init()
 
-    def load_one(self, level: Optional[ConfigLevel] = None) -> TOMLDocument:
+    def load_one(self, level: ConfigLevel | None = None) -> TOMLDocument:
         config_path = DataChainDir(self.get_dir(level)).config
 
         try:
@@ -128,7 +124,7 @@ class Config:
         return remote_conf
 
 
-def merge(into: Union[TOMLDocument, dict], update: Union[TOMLDocument, dict]):
+def merge(into: TOMLDocument | dict, update: TOMLDocument | dict):
    """Merges second dict into first recursively"""
    for key, val in update.items():
        if isinstance(into.get(key), dict) and isinstance(val, dict):
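
A worked example of merge(), assuming the conventional recurse-or-overwrite body that the truncated hunk implies (the keys here are illustrative):

    into = {"studio": {"url": "https://studio.example.com", "token": "old"}}
    update = {"studio": {"token": "new"}, "cache": {"dir": "/tmp/dc"}}
    merge(into, update)
    # into == {
    #     "studio": {"url": "https://studio.example.com", "token": "new"},
    #     "cache": {"dir": "/tmp/dc"},
    # }
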
datachain/data_storage/db_engine.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import sqlalchemy as sa
 from sqlalchemy.sql import FROM_LINTING
@@ -58,7 +58,7 @@ class DatabaseEngine(ABC, Serializable):
     @classmethod
     def compile_to_args(
         cls, statement: "ClauseElement", **kwargs
-    ) -> Union[tuple[str], tuple[str, dict[str, Any]]]:
+    ) -> tuple[str] | tuple[str, dict[str, Any]]:
         """
         Compile a sqlalchemy query or ddl object to an args tuple.
 
@@ -75,8 +75,8 @@ class DatabaseEngine(ABC, Serializable):
     def execute(
         self,
         query,
-        cursor: Optional[Any] = None,
-        conn: Optional[Any] = None,
+        cursor: Any | None = None,
+        conn: Any | None = None,
     ) -> Iterator[tuple[Any, ...]]: ...
 
     def get_table(self, name: str) -> "Table":
@@ -90,7 +90,7 @@ class DatabaseEngine(ABC, Serializable):
 
     @abstractmethod
     def executemany(
-        self, query, params, cursor: Optional[Any] = None
+        self, query, params, cursor: Any | None = None
     ) -> Iterator[tuple[Any, ...]]: ...
 
     @abstractmethod
@@ -112,7 +112,13 @@ class DatabaseEngine(ABC, Serializable):
         return sa.inspect(self.engine).has_table(name)
 
     @abstractmethod
-    def create_table(self, table: "Table", if_not_exists: bool = True) -> None: ...
+    def create_table(
+        self,
+        table: "Table",
+        if_not_exists: bool = True,
+        *,
+        kind: str | None = None,
+    ) -> None: ...
 
     @abstractmethod
     def drop_table(self, table: "Table", if_exists: bool = False) -> None: ...
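
A hypothetical subclass sketch showing what the widened abstract signature now requires; kind is keyword-only, so existing positional call sites keep working. Other abstract methods are omitted, so this is not instantiable as-is, and it assumes SQLAlchemy 1.4+ for the if_not_exists DDL option:

    import sqlalchemy as sa

    class EngineSketch(DatabaseEngine):
        def create_table(
            self,
            table: "sa.Table",
            if_not_exists: bool = True,
            *,
            kind: str | None = None,
        ) -> None:
            # `kind` could select a table flavor (e.g. temporary);
            # the plain CREATE TABLE path is shown as the default.
            self.execute(sa.schema.CreateTable(table, if_not_exists=if_not_exists))
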
datachain/data_storage/job.py CHANGED
@@ -3,6 +3,8 @@ from enum import Enum
 
 class JobStatus(int, Enum):
     CREATED = 1
+    SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -11,10 +13,12 @@ class JobStatus(int, Enum):
     CANCELING = 7
     CANCELED = 8
     CANCELING_SCHEDULED = 9
+    TASK = 11
+    PENDING = 13
 
     @classmethod
     def finished(cls) -> tuple[int, ...]:
-        return cls.COMPLETE, cls.FAILED, cls.CANCELED
+        return cls.COMPLETE, cls.FAILED, cls.CANCELED, cls.TASK
 
 
 class JobQueryType(int, Enum):
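
The practical effect of adding TASK to finished(), as a small illustrative check:

    assert JobStatus.TASK in JobStatus.finished()
    assert JobStatus.CANCELED in JobStatus.finished()
    assert JobStatus.RUNNING not in JobStatus.finished()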