datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/webdataset_laion.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import warnings
2
2
  from collections.abc import Iterator
3
- from typing import Optional
4
3
 
5
4
  import numpy as np
6
5
  from pydantic import BaseModel, Field
@@ -23,18 +22,18 @@ warnings.filterwarnings(
23
22
 
24
23
  class Laion(WDSReadableSubclass):
25
24
  uid: str = Field(default="")
26
- face_bboxes: Optional[list[list[float]]] = Field(default=None)
27
- caption: Optional[str] = Field(default=None)
28
- url: Optional[str] = Field(default=None)
29
- key: Optional[str] = Field(default=None)
30
- status: Optional[str] = Field(default=None)
31
- error_message: Optional[str] = Field(default=None)
32
- width: Optional[int] = Field(default=None)
33
- height: Optional[int] = Field(default=None)
34
- original_width: Optional[int] = Field(default=None)
35
- original_height: Optional[int] = Field(default=None)
36
- exif: Optional[str] = Field(default=None)
37
- sha256: Optional[str] = Field(default=None)
25
+ face_bboxes: list[list[float]] | None = Field(default=None)
26
+ caption: str | None = Field(default=None)
27
+ url: str | None = Field(default=None)
28
+ key: str | None = Field(default=None)
29
+ status: str | None = Field(default=None)
30
+ error_message: str | None = Field(default=None)
31
+ width: int | None = Field(default=None)
32
+ height: int | None = Field(default=None)
33
+ original_width: int | None = Field(default=None)
34
+ original_height: int | None = Field(default=None)
35
+ exif: str | None = Field(default=None)
36
+ sha256: str | None = Field(default=None)
38
37
 
39
38
  @staticmethod
40
39
  def _reader(builder, item):
@@ -42,13 +41,13 @@ class Laion(WDSReadableSubclass):
42
41
 
43
42
 
44
43
  class WDSLaion(WDSBasic):
45
- txt: Optional[str] = Field(default=None)
46
- json: Laion # type: ignore[assignment]
44
+ txt: str | None = Field(default=None)
45
+ json: Laion = Field(default_factory=Laion) # type: ignore[assignment]
47
46
 
48
47
 
49
48
  class LaionMeta(BaseModel):
50
49
  file: File
51
- index: Optional[int] = Field(default=None)
50
+ index: int | None = Field(default=None)
52
51
  b32_img: list[float] = Field(default=[])
53
52
  b32_txt: list[float] = Field(default=[])
54
53
  l14_img: list[float] = Field(default=[])
datachain/listing.py CHANGED
@@ -2,7 +2,7 @@ import glob
2
2
  import os
3
3
  from collections.abc import Iterable, Iterator
4
4
  from functools import cached_property
5
- from typing import TYPE_CHECKING, Optional
5
+ from typing import TYPE_CHECKING
6
6
 
7
7
  from sqlalchemy import Column
8
8
  from sqlalchemy.sql import func
@@ -25,8 +25,8 @@ class Listing:
25
25
  metastore: "AbstractMetastore",
26
26
  warehouse: "AbstractWarehouse",
27
27
  client: "Client",
28
- dataset_name: Optional["str"] = None,
29
- dataset_version: Optional[str] = None,
28
+ dataset_name: str | None = None,
29
+ dataset_version: str | None = None,
30
30
  column: str = "file",
31
31
  ):
32
32
  self.metastore = metastore
@@ -35,6 +35,7 @@ class Listing:
35
35
  self.dataset_name = dataset_name # dataset representing bucket listing
36
36
  self.dataset_version = dataset_version # dataset representing bucket listing
37
37
  self.column = column
38
+ self._closed = False
38
39
 
39
40
  def clone(self) -> "Listing":
40
41
  return self.__class__(
@@ -53,7 +54,13 @@ class Listing:
53
54
  self.close()
54
55
 
55
56
  def close(self) -> None:
56
- self.warehouse.close()
57
+ if self._closed:
58
+ return
59
+ self._closed = True
60
+ try:
61
+ self.warehouse.close_on_exit()
62
+ finally:
63
+ self.metastore.close_on_exit()
57
64
 
58
65
  @property
59
66
  def uri(self):
@@ -102,7 +109,7 @@ class Listing:
102
109
  def collect_nodes_to_instantiate(
103
110
  self,
104
111
  sources: Iterable["DataSource"],
105
- copy_to_filename: Optional[str],
112
+ copy_to_filename: str | None,
106
113
  recursive=False,
107
114
  copy_dir_contents=False,
108
115
  from_dataset=False,
datachain/model/bbox.py CHANGED
@@ -198,7 +198,9 @@ class BBox(DataModel):
198
198
  def pose_inside(self, pose: Union["Pose", "Pose3D"]) -> bool:
199
199
  """Return True if the pose is inside the bounding box."""
200
200
  return all(
201
- self.point_inside(x, y) for x, y in zip(pose.x, pose.y) if x > 0 or y > 0
201
+ self.point_inside(x, y)
202
+ for x, y in zip(pose.x, pose.y, strict=False)
203
+ if x > 0 or y > 0
202
204
  )
203
205
 
204
206
  @staticmethod
datachain/namespace.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import builtins
2
2
  from dataclasses import dataclass, fields
3
3
  from datetime import datetime
4
- from typing import Any, Optional, TypeVar
4
+ from typing import Any, TypeVar
5
5
 
6
6
  from datachain.error import InvalidNamespaceNameError
7
7
 
@@ -9,12 +9,31 @@ N = TypeVar("N", bound="Namespace")
9
9
  NAMESPACE_NAME_RESERVED_CHARS = [".", "@"]
10
10
 
11
11
 
12
+ def parse_name(name: str) -> tuple[str, str | None]:
13
+ """
14
+ Parses namespace name into namespace and optional project name.
15
+ If both namespace and project are defined in name, they need to be split by dot
16
+ e.g dev.my-project
17
+ Valid inputs:
18
+ - dev.my-project
19
+ - dev
20
+ """
21
+ parts = name.split(".")
22
+ if len(parts) == 1:
23
+ return name, None
24
+ if len(parts) == 2:
25
+ return parts[0], parts[1]
26
+ raise InvalidNamespaceNameError(
27
+ f"Invalid namespace format: {name}. Expected 'namespace' or 'ns1.ns2'."
28
+ )
29
+
30
+
12
31
  @dataclass(frozen=True)
13
32
  class Namespace:
14
33
  id: int
15
34
  uuid: str
16
35
  name: str
17
- descr: Optional[str]
36
+ descr: str | None
18
37
  created_at: datetime
19
38
 
20
39
  @staticmethod
@@ -54,7 +73,7 @@ class Namespace:
54
73
  id: int,
55
74
  uuid: str,
56
75
  name: str,
57
- descr: Optional[str],
76
+ descr: str | None,
58
77
  created_at: datetime,
59
78
  ) -> "Namespace":
60
79
  return cls(id, uuid, name, descr, created_at)
datachain/node.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import os
2
2
  from datetime import datetime
3
- from typing import TYPE_CHECKING, Any, Optional
3
+ from typing import TYPE_CHECKING, Any
4
4
 
5
5
  import attrs
6
6
 
@@ -53,11 +53,11 @@ class Node:
53
53
  sys__rand: int = 0
54
54
  path: str = ""
55
55
  etag: str = ""
56
- version: Optional[str] = None
56
+ version: str | None = None
57
57
  is_latest: bool = True
58
- last_modified: Optional[datetime] = None
58
+ last_modified: datetime | None = None
59
59
  size: int = 0
60
- location: Optional[str] = None
60
+ location: str | None = None
61
61
  source: StorageURI = StorageURI("") # noqa: RUF009
62
62
  dir_type: int = DirType.FILE
63
63
 
@@ -90,7 +90,7 @@ class Node:
90
90
  return self.path + "/"
91
91
  return self.path
92
92
 
93
- def to_file(self, source: Optional[StorageURI] = None) -> File:
93
+ def to_file(self, source: StorageURI | None = None) -> File:
94
94
  if source is None:
95
95
  source = self.source
96
96
  return File(
@@ -189,7 +189,7 @@ class NodeWithPath:
189
189
  TIME_FMT = "%Y-%m-%d %H:%M"
190
190
 
191
191
 
192
- def long_line_str(name: str, timestamp: Optional[datetime]) -> str:
192
+ def long_line_str(name: str, timestamp: datetime | None) -> str:
193
193
  if timestamp is None:
194
194
  time = "-"
195
195
  else:
datachain/nodes_thread_pool.py CHANGED
@@ -1,4 +1,3 @@
1
- import concurrent
2
1
  import concurrent.futures
3
2
  import threading
4
3
  from abc import ABC, abstractmethod
datachain/plugins.py ADDED
@@ -0,0 +1,24 @@
1
+ """Plugin loader for DataChain callables.
2
+
3
+ Discovers and invokes entry points in the group "datachain.callables" once
4
+ per process. This enables external packages (e.g., Studio) to register
5
+ their callables with the serializer registry without explicit imports.
6
+ """
7
+
8
+ from importlib import metadata as importlib_metadata
9
+
10
+ _plugins_loaded = False
11
+
12
+
13
+ def ensure_plugins_loaded() -> None:
14
+ global _plugins_loaded # noqa: PLW0603
15
+ if _plugins_loaded:
16
+ return
17
+
18
+ # Compatible across importlib.metadata versions
19
+ eps_obj = importlib_metadata.entry_points()
20
+ for ep in eps_obj.select(group="datachain.callables"):
21
+ func = ep.load()
22
+ func()
23
+
24
+ _plugins_loaded = True
datachain/project.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import builtins
2
2
  from dataclasses import dataclass, fields
3
3
  from datetime import datetime
4
- from typing import Any, Optional, TypeVar
4
+ from typing import Any, TypeVar
5
5
 
6
6
  from datachain.error import InvalidProjectNameError
7
7
  from datachain.namespace import Namespace
@@ -15,7 +15,7 @@ class Project:
15
15
  id: int
16
16
  uuid: str
17
17
  name: str
18
- descr: Optional[str]
18
+ descr: str | None
19
19
  created_at: datetime
20
20
  namespace: Namespace
21
21
 
@@ -52,12 +52,12 @@ class Project:
52
52
  namespace_id: int,
53
53
  namespace_uuid: str,
54
54
  namespace_name: str,
55
- namespace_descr: Optional[str],
55
+ namespace_descr: str | None,
56
56
  namespace_created_at: datetime,
57
57
  project_id: int,
58
58
  uuid: str,
59
59
  name: str,
60
- descr: Optional[str],
60
+ descr: str | None,
61
61
  created_at: datetime,
62
62
  project_namespace_id: int,
63
63
  ) -> "Project":
datachain/query/batch.py CHANGED
@@ -1,16 +1,14 @@
1
1
  import contextlib
2
2
  import math
3
3
  from abc import ABC, abstractmethod
4
- from collections.abc import Generator, Sequence
5
- from typing import Callable, Optional, Union
4
+ from collections.abc import Callable, Generator, Sequence
6
5
 
7
6
  import sqlalchemy as sa
8
7
 
9
8
  from datachain.data_storage.schema import PARTITION_COLUMN_ID
10
- from datachain.query.utils import get_query_column
11
9
 
12
10
  RowsOutputBatch = Sequence[Sequence]
13
- RowsOutput = Union[Sequence, RowsOutputBatch]
11
+ RowsOutput = Sequence | RowsOutputBatch
14
12
 
15
13
 
16
14
  class BatchingStrategy(ABC):
@@ -23,7 +21,7 @@ class BatchingStrategy(ABC):
23
21
  self,
24
22
  execute: Callable,
25
23
  query: sa.Select,
26
- id_col: Optional[sa.ColumnElement] = None,
24
+ id_col: sa.ColumnElement | None = None,
27
25
  ) -> Generator[RowsOutput, None, None]:
28
26
  """Apply the provided parameters to the UDF."""
29
27
 
@@ -40,7 +38,7 @@ class NoBatching(BatchingStrategy):
40
38
  self,
41
39
  execute: Callable,
42
40
  query: sa.Select,
43
- id_col: Optional[sa.ColumnElement] = None,
41
+ id_col: sa.ColumnElement | None = None,
44
42
  ) -> Generator[Sequence, None, None]:
45
43
  ids_only = False
46
44
  if id_col is not None:
@@ -66,7 +64,7 @@ class Batch(BatchingStrategy):
66
64
  self,
67
65
  execute: Callable,
68
66
  query: sa.Select,
69
- id_col: Optional[sa.ColumnElement] = None,
67
+ id_col: sa.ColumnElement | None = None,
70
68
  ) -> Generator[RowsOutput, None, None]:
71
69
  from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
72
70
 
@@ -81,8 +79,8 @@ class Batch(BatchingStrategy):
81
79
  # select rows in batches
82
80
  results = []
83
81
 
84
- with contextlib.closing(execute(query, page_size=page_size)) as batch_rows:
85
- for row in batch_rows:
82
+ with contextlib.closing(execute(query, page_size=page_size)) as rows:
83
+ for row in rows:
86
84
  results.append(row)
87
85
  if len(results) >= self.count:
88
86
  batch, results = results[: self.count], results[self.count :]
@@ -105,9 +103,9 @@ class Partition(BatchingStrategy):
105
103
  self,
106
104
  execute: Callable,
107
105
  query: sa.Select,
108
- id_col: Optional[sa.ColumnElement] = None,
106
+ id_col: sa.ColumnElement | None = None,
109
107
  ) -> Generator[RowsOutput, None, None]:
110
- if (partition_col := get_query_column(query, PARTITION_COLUMN_ID)) is None:
108
+ if (partition_col := query.selected_columns.get(PARTITION_COLUMN_ID)) is None:
111
109
  raise RuntimeError("partition column not found in query")
112
110
 
113
111
  ids_only = False
@@ -115,7 +113,7 @@ class Partition(BatchingStrategy):
115
113
  query = query.with_only_columns(id_col, partition_col)
116
114
  ids_only = True
117
115
 
118
- current_partition: Optional[int] = None
116
+ current_partition: int | None = None
119
117
  batch: list = []
120
118
 
121
119
  query_fields = [str(c.name) for c in query.selected_columns]