datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED
@@ -6,6 +6,7 @@ from datachain.lib.dc import (
6
6
  Sys,
7
7
  datasets,
8
8
  delete_dataset,
9
+ is_local,
9
10
  is_studio,
10
11
  listings,
11
12
  move_dataset,
@@ -37,6 +38,7 @@ from datachain.lib.file import (
37
38
  VideoFrame,
38
39
  )
39
40
  from datachain.lib.model_store import ModelStore
41
+ from datachain.lib.namespaces import delete_namespace
40
42
  from datachain.lib.projects import create as create_project
41
43
  from datachain.lib.udf import Aggregator, Generator, Mapper
42
44
  from datachain.lib.utils import AbstractUDF, DataChainError
@@ -74,7 +76,9 @@ __all__ = [
74
76
  "create_project",
75
77
  "datasets",
76
78
  "delete_dataset",
79
+ "delete_namespace",
77
80
  "is_chain_type",
81
+ "is_local",
78
82
  "is_studio",
79
83
  "listings",
80
84
  "metrics",
datachain/asyn.py CHANGED
@@ -3,6 +3,7 @@ import threading
3
3
  from collections.abc import (
4
4
  AsyncIterable,
5
5
  Awaitable,
6
+ Callable,
6
7
  Coroutine,
7
8
  Generator,
8
9
  Iterable,
@@ -10,7 +11,7 @@ from collections.abc import (
10
11
  )
11
12
  from concurrent.futures import ThreadPoolExecutor, wait
12
13
  from heapq import heappop, heappush
13
- from typing import Any, Callable, Generic, Optional, TypeVar
14
+ from typing import Any, Generic, TypeVar
14
15
 
15
16
  from fsspec.asyn import get_loop
16
17
 
@@ -49,7 +50,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
49
50
  iterable: Iterable[InputT],
50
51
  *,
51
52
  workers: int = ASYNC_WORKERS,
52
- loop: Optional[asyncio.AbstractEventLoop] = None,
53
+ loop: asyncio.AbstractEventLoop | None = None,
53
54
  ):
54
55
  self.func = func
55
56
  self.iterable = iterable
@@ -107,9 +108,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
107
108
 
108
109
  async def init(self) -> None:
109
110
  self.work_queue = asyncio.Queue(2 * self.workers)
110
- self.result_queue: asyncio.Queue[Optional[ResultT]] = asyncio.Queue(
111
- self.workers
112
- )
111
+ self.result_queue: asyncio.Queue[ResultT | None] = asyncio.Queue(self.workers)
113
112
 
114
113
  async def run(self) -> None:
115
114
  producer = self.start_task(self.produce())
@@ -149,10 +148,10 @@ class AsyncMapper(Generic[InputT, ResultT]):
149
148
  if exc:
150
149
  raise exc
151
150
 
152
- async def _pop_result(self) -> Optional[ResultT]:
151
+ async def _pop_result(self) -> ResultT | None:
153
152
  return await self.result_queue.get()
154
153
 
155
- def next_result(self, timeout=None) -> Optional[ResultT]:
154
+ def next_result(self, timeout=None) -> ResultT | None:
156
155
  """
157
156
  Return the next available result.
158
157
 
@@ -212,17 +211,17 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
212
211
  iterable: Iterable[InputT],
213
212
  *,
214
213
  workers: int = ASYNC_WORKERS,
215
- loop: Optional[asyncio.AbstractEventLoop] = None,
214
+ loop: asyncio.AbstractEventLoop | None = None,
216
215
  ):
217
216
  super().__init__(func, iterable, workers=workers, loop=loop)
218
217
  self._waiters: dict[int, Any] = {}
219
- self._getters: dict[int, asyncio.Future[Optional[ResultT]]] = {}
220
- self.heap: list[tuple[int, Optional[ResultT]]] = []
218
+ self._getters: dict[int, asyncio.Future[ResultT | None]] = {}
219
+ self.heap: list[tuple[int, ResultT | None]] = []
221
220
  self._next_yield = 0
222
221
  self._items_seen = 0
223
222
  self._window = 2 * workers
224
223
 
225
- def _push_result(self, i: int, result: Optional[ResultT]) -> None:
224
+ def _push_result(self, i: int, result: ResultT | None) -> None:
226
225
  if i in self._getters:
227
226
  future = self._getters.pop(i)
228
227
  future.set_result(result)
@@ -243,7 +242,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
243
242
  async def init(self) -> None:
244
243
  self.work_queue = asyncio.Queue(2 * self.workers)
245
244
 
246
- async def _pop_result(self) -> Optional[ResultT]:
245
+ async def _pop_result(self) -> ResultT | None:
247
246
  if self.heap and self.heap[0][0] == self._next_yield:
248
247
  _i, out = heappop(self.heap)
249
248
  else:
datachain/cache.py CHANGED
@@ -2,7 +2,7 @@ import os
2
2
  from collections.abc import Iterator
3
3
  from contextlib import contextmanager
4
4
  from tempfile import mkdtemp
5
- from typing import TYPE_CHECKING, Optional
5
+ from typing import TYPE_CHECKING
6
6
 
7
7
  from dvc_data.hashfile.db.local import LocalHashFileDB
8
8
  from dvc_objects.fs.local import LocalFileSystem
@@ -22,14 +22,14 @@ def try_scandir(path):
22
22
  pass
23
23
 
24
24
 
25
- def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
25
+ def get_temp_cache(tmp_dir: str, prefix: str | None = None) -> "Cache":
26
26
  cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
27
27
  return Cache(cache_dir, tmp_dir=tmp_dir)
28
28
 
29
29
 
30
30
  @contextmanager
31
31
  def temporary_cache(
32
- tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
32
+ tmp_dir: str, prefix: str | None = None, delete: bool = True
33
33
  ) -> Iterator["Cache"]:
34
34
  cache = get_temp_cache(tmp_dir, prefix=prefix)
35
35
  try:
@@ -58,7 +58,7 @@ class Cache: # noqa: PLW1641
58
58
  def tmp_dir(self):
59
59
  return self.odb.tmp_dir
60
60
 
61
- def get_path(self, file: "File") -> Optional[str]:
61
+ def get_path(self, file: "File") -> str | None:
62
62
  if self.contains(file):
63
63
  return self.path_from_checksum(file.get_hash())
64
64
  return None
@@ -74,7 +74,7 @@ class Cache: # noqa: PLW1641
74
74
  self.odb.delete(file.get_hash())
75
75
 
76
76
  async def download(
77
- self, file: "File", client: "Client", callback: Optional[Callback] = None
77
+ self, file: "File", client: "Client", callback: Callback | None = None
78
78
  ) -> None:
79
79
  from dvc_objects.fs.utils import tmp_fname
80
80
 
datachain/catalog/__init__.py CHANGED
@@ -1,7 +1,6 @@
1
1
  from .catalog import (
2
2
  QUERY_DATASET_PREFIX,
3
3
  QUERY_SCRIPT_CANCELED_EXIT_CODE,
4
- QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
5
4
  Catalog,
6
5
  is_namespace_local,
7
6
  )
@@ -10,7 +9,6 @@ from .loader import get_catalog
10
9
  __all__ = [
11
10
  "QUERY_DATASET_PREFIX",
12
11
  "QUERY_SCRIPT_CANCELED_EXIT_CODE",
13
- "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
14
12
  "Catalog",
15
13
  "get_catalog",
16
14
  "is_namespace_local",