datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED
@@ -5,8 +5,13 @@ from datachain.lib.dc import (
5
5
  DataChain,
6
6
  Sys,
7
7
  datasets,
8
+ delete_dataset,
9
+ is_local,
10
+ is_studio,
8
11
  listings,
12
+ move_dataset,
9
13
  read_csv,
14
+ read_database,
10
15
  read_dataset,
11
16
  read_hf,
12
17
  read_json,
@@ -18,6 +23,9 @@ from datachain.lib.dc import (
18
23
  )
19
24
  from datachain.lib.file import (
20
25
  ArrowRow,
26
+ Audio,
27
+ AudioFile,
28
+ AudioFragment,
21
29
  File,
22
30
  FileError,
23
31
  Image,
@@ -30,6 +38,8 @@ from datachain.lib.file import (
30
38
  VideoFrame,
31
39
  )
32
40
  from datachain.lib.model_store import ModelStore
41
+ from datachain.lib.namespaces import delete_namespace
42
+ from datachain.lib.projects import create as create_project
33
43
  from datachain.lib.udf import Aggregator, Generator, Mapper
34
44
  from datachain.lib.utils import AbstractUDF, DataChainError
35
45
  from datachain.query import metrics, param
@@ -39,6 +49,9 @@ __all__ = [
39
49
  "AbstractUDF",
40
50
  "Aggregator",
41
51
  "ArrowRow",
52
+ "Audio",
53
+ "AudioFile",
54
+ "AudioFragment",
42
55
  "C",
43
56
  "Column",
44
57
  "DataChain",
@@ -60,12 +73,19 @@ __all__ = [
60
73
  "VideoFile",
61
74
  "VideoFragment",
62
75
  "VideoFrame",
76
+ "create_project",
63
77
  "datasets",
78
+ "delete_dataset",
79
+ "delete_namespace",
64
80
  "is_chain_type",
81
+ "is_local",
82
+ "is_studio",
65
83
  "listings",
66
84
  "metrics",
85
+ "move_dataset",
67
86
  "param",
68
87
  "read_csv",
88
+ "read_database",
69
89
  "read_dataset",
70
90
  "read_hf",
71
91
  "read_json",
datachain/asyn.py CHANGED
@@ -3,6 +3,7 @@ import threading
3
3
  from collections.abc import (
4
4
  AsyncIterable,
5
5
  Awaitable,
6
+ Callable,
6
7
  Coroutine,
7
8
  Generator,
8
9
  Iterable,
@@ -10,7 +11,7 @@ from collections.abc import (
10
11
  )
11
12
  from concurrent.futures import ThreadPoolExecutor, wait
12
13
  from heapq import heappop, heappush
13
- from typing import Any, Callable, Generic, Optional, TypeVar
14
+ from typing import Any, Generic, TypeVar
14
15
 
15
16
  from fsspec.asyn import get_loop
16
17
 
@@ -49,7 +50,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
49
50
  iterable: Iterable[InputT],
50
51
  *,
51
52
  workers: int = ASYNC_WORKERS,
52
- loop: Optional[asyncio.AbstractEventLoop] = None,
53
+ loop: asyncio.AbstractEventLoop | None = None,
53
54
  ):
54
55
  self.func = func
55
56
  self.iterable = iterable
@@ -107,9 +108,7 @@ class AsyncMapper(Generic[InputT, ResultT]):
107
108
 
108
109
  async def init(self) -> None:
109
110
  self.work_queue = asyncio.Queue(2 * self.workers)
110
- self.result_queue: asyncio.Queue[Optional[ResultT]] = asyncio.Queue(
111
- self.workers
112
- )
111
+ self.result_queue: asyncio.Queue[ResultT | None] = asyncio.Queue(self.workers)
113
112
 
114
113
  async def run(self) -> None:
115
114
  producer = self.start_task(self.produce())
@@ -149,10 +148,10 @@ class AsyncMapper(Generic[InputT, ResultT]):
149
148
  if exc:
150
149
  raise exc
151
150
 
152
- async def _pop_result(self) -> Optional[ResultT]:
151
+ async def _pop_result(self) -> ResultT | None:
153
152
  return await self.result_queue.get()
154
153
 
155
- def next_result(self, timeout=None) -> Optional[ResultT]:
154
+ def next_result(self, timeout=None) -> ResultT | None:
156
155
  """
157
156
  Return the next available result.
158
157
 
@@ -212,17 +211,17 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
212
211
  iterable: Iterable[InputT],
213
212
  *,
214
213
  workers: int = ASYNC_WORKERS,
215
- loop: Optional[asyncio.AbstractEventLoop] = None,
214
+ loop: asyncio.AbstractEventLoop | None = None,
216
215
  ):
217
216
  super().__init__(func, iterable, workers=workers, loop=loop)
218
217
  self._waiters: dict[int, Any] = {}
219
- self._getters: dict[int, asyncio.Future[Optional[ResultT]]] = {}
220
- self.heap: list[tuple[int, Optional[ResultT]]] = []
218
+ self._getters: dict[int, asyncio.Future[ResultT | None]] = {}
219
+ self.heap: list[tuple[int, ResultT | None]] = []
221
220
  self._next_yield = 0
222
221
  self._items_seen = 0
223
222
  self._window = 2 * workers
224
223
 
225
- def _push_result(self, i: int, result: Optional[ResultT]) -> None:
224
+ def _push_result(self, i: int, result: ResultT | None) -> None:
226
225
  if i in self._getters:
227
226
  future = self._getters.pop(i)
228
227
  future.set_result(result)
@@ -243,7 +242,7 @@ class OrderedMapper(AsyncMapper[InputT, ResultT]):
243
242
  async def init(self) -> None:
244
243
  self.work_queue = asyncio.Queue(2 * self.workers)
245
244
 
246
- async def _pop_result(self) -> Optional[ResultT]:
245
+ async def _pop_result(self) -> ResultT | None:
247
246
  if self.heap and self.heap[0][0] == self._next_yield:
248
247
  _i, out = heappop(self.heap)
249
248
  else:
datachain/cache.py CHANGED
@@ -2,7 +2,7 @@ import os
2
2
  from collections.abc import Iterator
3
3
  from contextlib import contextmanager
4
4
  from tempfile import mkdtemp
5
- from typing import TYPE_CHECKING, Optional
5
+ from typing import TYPE_CHECKING
6
6
 
7
7
  from dvc_data.hashfile.db.local import LocalHashFileDB
8
8
  from dvc_objects.fs.local import LocalFileSystem
@@ -22,14 +22,14 @@ def try_scandir(path):
22
22
  pass
23
23
 
24
24
 
25
- def get_temp_cache(tmp_dir: str, prefix: Optional[str] = None) -> "Cache":
25
+ def get_temp_cache(tmp_dir: str, prefix: str | None = None) -> "Cache":
26
26
  cache_dir = mkdtemp(prefix=prefix, dir=tmp_dir)
27
27
  return Cache(cache_dir, tmp_dir=tmp_dir)
28
28
 
29
29
 
30
30
  @contextmanager
31
31
  def temporary_cache(
32
- tmp_dir: str, prefix: Optional[str] = None, delete: bool = True
32
+ tmp_dir: str, prefix: str | None = None, delete: bool = True
33
33
  ) -> Iterator["Cache"]:
34
34
  cache = get_temp_cache(tmp_dir, prefix=prefix)
35
35
  try:
@@ -39,7 +39,7 @@ def temporary_cache(
39
39
  cache.destroy()
40
40
 
41
41
 
42
- class Cache:
42
+ class Cache: # noqa: PLW1641
43
43
  def __init__(self, cache_dir: str, tmp_dir: str):
44
44
  self.odb = LocalHashFileDB(
45
45
  LocalFileSystem(),
@@ -58,7 +58,7 @@ class Cache:
58
58
  def tmp_dir(self):
59
59
  return self.odb.tmp_dir
60
60
 
61
- def get_path(self, file: "File") -> Optional[str]:
61
+ def get_path(self, file: "File") -> str | None:
62
62
  if self.contains(file):
63
63
  return self.path_from_checksum(file.get_hash())
64
64
  return None
@@ -74,11 +74,11 @@ class Cache:
74
74
  self.odb.delete(file.get_hash())
75
75
 
76
76
  async def download(
77
- self, file: "File", client: "Client", callback: Optional[Callback] = None
77
+ self, file: "File", client: "Client", callback: Callback | None = None
78
78
  ) -> None:
79
- from_path = f"{file.source}/{file.path}"
80
79
  from dvc_objects.fs.utils import tmp_fname
81
80
 
81
+ from_path = file.get_uri()
82
82
  odb_fs = self.odb.fs
83
83
  tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
84
84
  size = file.size
datachain/catalog/__init__.py CHANGED
@@ -1,15 +1,15 @@
1
1
  from .catalog import (
2
2
  QUERY_DATASET_PREFIX,
3
3
  QUERY_SCRIPT_CANCELED_EXIT_CODE,
4
- QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
5
4
  Catalog,
5
+ is_namespace_local,
6
6
  )
7
7
  from .loader import get_catalog
8
8
 
9
9
  __all__ = [
10
10
  "QUERY_DATASET_PREFIX",
11
11
  "QUERY_SCRIPT_CANCELED_EXIT_CODE",
12
- "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
13
12
  "Catalog",
14
13
  "get_catalog",
14
+ "is_namespace_local",
15
15
  ]