datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import warnings
|
|
2
2
|
from collections.abc import Iterator
|
|
3
|
-
from typing import Optional
|
|
4
3
|
|
|
5
4
|
import numpy as np
|
|
6
5
|
from pydantic import BaseModel, Field
|
|
@@ -23,18 +22,18 @@ warnings.filterwarnings(
|
|
|
23
22
|
|
|
24
23
|
class Laion(WDSReadableSubclass):
|
|
25
24
|
uid: str = Field(default="")
|
|
26
|
-
face_bboxes:
|
|
27
|
-
caption:
|
|
28
|
-
url:
|
|
29
|
-
key:
|
|
30
|
-
status:
|
|
31
|
-
error_message:
|
|
32
|
-
width:
|
|
33
|
-
height:
|
|
34
|
-
original_width:
|
|
35
|
-
original_height:
|
|
36
|
-
exif:
|
|
37
|
-
sha256:
|
|
25
|
+
face_bboxes: list[list[float]] | None = Field(default=None)
|
|
26
|
+
caption: str | None = Field(default=None)
|
|
27
|
+
url: str | None = Field(default=None)
|
|
28
|
+
key: str | None = Field(default=None)
|
|
29
|
+
status: str | None = Field(default=None)
|
|
30
|
+
error_message: str | None = Field(default=None)
|
|
31
|
+
width: int | None = Field(default=None)
|
|
32
|
+
height: int | None = Field(default=None)
|
|
33
|
+
original_width: int | None = Field(default=None)
|
|
34
|
+
original_height: int | None = Field(default=None)
|
|
35
|
+
exif: str | None = Field(default=None)
|
|
36
|
+
sha256: str | None = Field(default=None)
|
|
38
37
|
|
|
39
38
|
@staticmethod
|
|
40
39
|
def _reader(builder, item):
|
|
@@ -42,13 +41,13 @@ class Laion(WDSReadableSubclass):
|
|
|
42
41
|
|
|
43
42
|
|
|
44
43
|
class WDSLaion(WDSBasic):
|
|
45
|
-
txt:
|
|
46
|
-
json: Laion # type: ignore[assignment]
|
|
44
|
+
txt: str | None = Field(default=None)
|
|
45
|
+
json: Laion = Field(default_factory=Laion) # type: ignore[assignment]
|
|
47
46
|
|
|
48
47
|
|
|
49
48
|
class LaionMeta(BaseModel):
|
|
50
49
|
file: File
|
|
51
|
-
index:
|
|
50
|
+
index: int | None = Field(default=None)
|
|
52
51
|
b32_img: list[float] = Field(default=[])
|
|
53
52
|
b32_txt: list[float] = Field(default=[])
|
|
54
53
|
l14_img: list[float] = Field(default=[])
|
datachain/listing.py
CHANGED
|
@@ -2,7 +2,7 @@ import glob
|
|
|
2
2
|
import os
|
|
3
3
|
from collections.abc import Iterable, Iterator
|
|
4
4
|
from functools import cached_property
|
|
5
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
6
|
|
|
7
7
|
from sqlalchemy import Column
|
|
8
8
|
from sqlalchemy.sql import func
|
|
@@ -25,8 +25,8 @@ class Listing:
|
|
|
25
25
|
metastore: "AbstractMetastore",
|
|
26
26
|
warehouse: "AbstractWarehouse",
|
|
27
27
|
client: "Client",
|
|
28
|
-
dataset_name:
|
|
29
|
-
dataset_version:
|
|
28
|
+
dataset_name: str | None = None,
|
|
29
|
+
dataset_version: str | None = None,
|
|
30
30
|
column: str = "file",
|
|
31
31
|
):
|
|
32
32
|
self.metastore = metastore
|
|
@@ -35,6 +35,7 @@ class Listing:
|
|
|
35
35
|
self.dataset_name = dataset_name # dataset representing bucket listing
|
|
36
36
|
self.dataset_version = dataset_version # dataset representing bucket listing
|
|
37
37
|
self.column = column
|
|
38
|
+
self._closed = False
|
|
38
39
|
|
|
39
40
|
def clone(self) -> "Listing":
|
|
40
41
|
return self.__class__(
|
|
@@ -53,7 +54,13 @@ class Listing:
|
|
|
53
54
|
self.close()
|
|
54
55
|
|
|
55
56
|
def close(self) -> None:
|
|
56
|
-
self.
|
|
57
|
+
if self._closed:
|
|
58
|
+
return
|
|
59
|
+
self._closed = True
|
|
60
|
+
try:
|
|
61
|
+
self.warehouse.close_on_exit()
|
|
62
|
+
finally:
|
|
63
|
+
self.metastore.close_on_exit()
|
|
57
64
|
|
|
58
65
|
@property
|
|
59
66
|
def uri(self):
|
|
@@ -102,7 +109,7 @@ class Listing:
|
|
|
102
109
|
def collect_nodes_to_instantiate(
|
|
103
110
|
self,
|
|
104
111
|
sources: Iterable["DataSource"],
|
|
105
|
-
copy_to_filename:
|
|
112
|
+
copy_to_filename: str | None,
|
|
106
113
|
recursive=False,
|
|
107
114
|
copy_dir_contents=False,
|
|
108
115
|
from_dataset=False,
|
datachain/model/bbox.py
CHANGED
|
@@ -198,7 +198,9 @@ class BBox(DataModel):
|
|
|
198
198
|
def pose_inside(self, pose: Union["Pose", "Pose3D"]) -> bool:
|
|
199
199
|
"""Return True if the pose is inside the bounding box."""
|
|
200
200
|
return all(
|
|
201
|
-
self.point_inside(x, y)
|
|
201
|
+
self.point_inside(x, y)
|
|
202
|
+
for x, y in zip(pose.x, pose.y, strict=False)
|
|
203
|
+
if x > 0 or y > 0
|
|
202
204
|
)
|
|
203
205
|
|
|
204
206
|
@staticmethod
|
datachain/namespace.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import builtins
|
|
2
2
|
from dataclasses import dataclass, fields
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
5
|
|
|
6
6
|
from datachain.error import InvalidNamespaceNameError
|
|
7
7
|
|
|
@@ -9,12 +9,31 @@ N = TypeVar("N", bound="Namespace")
|
|
|
9
9
|
NAMESPACE_NAME_RESERVED_CHARS = [".", "@"]
|
|
10
10
|
|
|
11
11
|
|
|
12
|
+
def parse_name(name: str) -> tuple[str, str | None]:
|
|
13
|
+
"""
|
|
14
|
+
Parses namespace name into namespace and optional project name.
|
|
15
|
+
If both namespace and project are defined in name, they need to be split by dot
|
|
16
|
+
e.g dev.my-project
|
|
17
|
+
Valid inputs:
|
|
18
|
+
- dev.my-project
|
|
19
|
+
- dev
|
|
20
|
+
"""
|
|
21
|
+
parts = name.split(".")
|
|
22
|
+
if len(parts) == 1:
|
|
23
|
+
return name, None
|
|
24
|
+
if len(parts) == 2:
|
|
25
|
+
return parts[0], parts[1]
|
|
26
|
+
raise InvalidNamespaceNameError(
|
|
27
|
+
f"Invalid namespace format: {name}. Expected 'namespace' or 'ns1.ns2'."
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
12
31
|
@dataclass(frozen=True)
|
|
13
32
|
class Namespace:
|
|
14
33
|
id: int
|
|
15
34
|
uuid: str
|
|
16
35
|
name: str
|
|
17
|
-
descr:
|
|
36
|
+
descr: str | None
|
|
18
37
|
created_at: datetime
|
|
19
38
|
|
|
20
39
|
@staticmethod
|
|
@@ -54,7 +73,7 @@ class Namespace:
|
|
|
54
73
|
id: int,
|
|
55
74
|
uuid: str,
|
|
56
75
|
name: str,
|
|
57
|
-
descr:
|
|
76
|
+
descr: str | None,
|
|
58
77
|
created_at: datetime,
|
|
59
78
|
) -> "Namespace":
|
|
60
79
|
return cls(id, uuid, name, descr, created_at)
|
datachain/node.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from datetime import datetime
|
|
3
|
-
from typing import TYPE_CHECKING, Any
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
4
|
|
|
5
5
|
import attrs
|
|
6
6
|
|
|
@@ -53,11 +53,11 @@ class Node:
|
|
|
53
53
|
sys__rand: int = 0
|
|
54
54
|
path: str = ""
|
|
55
55
|
etag: str = ""
|
|
56
|
-
version:
|
|
56
|
+
version: str | None = None
|
|
57
57
|
is_latest: bool = True
|
|
58
|
-
last_modified:
|
|
58
|
+
last_modified: datetime | None = None
|
|
59
59
|
size: int = 0
|
|
60
|
-
location:
|
|
60
|
+
location: str | None = None
|
|
61
61
|
source: StorageURI = StorageURI("") # noqa: RUF009
|
|
62
62
|
dir_type: int = DirType.FILE
|
|
63
63
|
|
|
@@ -90,7 +90,7 @@ class Node:
|
|
|
90
90
|
return self.path + "/"
|
|
91
91
|
return self.path
|
|
92
92
|
|
|
93
|
-
def to_file(self, source:
|
|
93
|
+
def to_file(self, source: StorageURI | None = None) -> File:
|
|
94
94
|
if source is None:
|
|
95
95
|
source = self.source
|
|
96
96
|
return File(
|
|
@@ -189,7 +189,7 @@ class NodeWithPath:
|
|
|
189
189
|
TIME_FMT = "%Y-%m-%d %H:%M"
|
|
190
190
|
|
|
191
191
|
|
|
192
|
-
def long_line_str(name: str, timestamp:
|
|
192
|
+
def long_line_str(name: str, timestamp: datetime | None) -> str:
|
|
193
193
|
if timestamp is None:
|
|
194
194
|
time = "-"
|
|
195
195
|
else:
|
datachain/nodes_thread_pool.py
CHANGED
datachain/plugins.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Plugin loader for DataChain callables.
|
|
2
|
+
|
|
3
|
+
Discovers and invokes entry points in the group "datachain.callables" once
|
|
4
|
+
per process. This enables external packages (e.g., Studio) to register
|
|
5
|
+
their callables with the serializer registry without explicit imports.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from importlib import metadata as importlib_metadata
|
|
9
|
+
|
|
10
|
+
_plugins_loaded = False
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ensure_plugins_loaded() -> None:
|
|
14
|
+
global _plugins_loaded # noqa: PLW0603
|
|
15
|
+
if _plugins_loaded:
|
|
16
|
+
return
|
|
17
|
+
|
|
18
|
+
# Compatible across importlib.metadata versions
|
|
19
|
+
eps_obj = importlib_metadata.entry_points()
|
|
20
|
+
for ep in eps_obj.select(group="datachain.callables"):
|
|
21
|
+
func = ep.load()
|
|
22
|
+
func()
|
|
23
|
+
|
|
24
|
+
_plugins_loaded = True
|
datachain/project.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import builtins
|
|
2
2
|
from dataclasses import dataclass, fields
|
|
3
3
|
from datetime import datetime
|
|
4
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
5
|
|
|
6
6
|
from datachain.error import InvalidProjectNameError
|
|
7
7
|
from datachain.namespace import Namespace
|
|
@@ -15,7 +15,7 @@ class Project:
|
|
|
15
15
|
id: int
|
|
16
16
|
uuid: str
|
|
17
17
|
name: str
|
|
18
|
-
descr:
|
|
18
|
+
descr: str | None
|
|
19
19
|
created_at: datetime
|
|
20
20
|
namespace: Namespace
|
|
21
21
|
|
|
@@ -52,12 +52,12 @@ class Project:
|
|
|
52
52
|
namespace_id: int,
|
|
53
53
|
namespace_uuid: str,
|
|
54
54
|
namespace_name: str,
|
|
55
|
-
namespace_descr:
|
|
55
|
+
namespace_descr: str | None,
|
|
56
56
|
namespace_created_at: datetime,
|
|
57
57
|
project_id: int,
|
|
58
58
|
uuid: str,
|
|
59
59
|
name: str,
|
|
60
|
-
descr:
|
|
60
|
+
descr: str | None,
|
|
61
61
|
created_at: datetime,
|
|
62
62
|
project_namespace_id: int,
|
|
63
63
|
) -> "Project":
|
datachain/query/batch.py
CHANGED
|
@@ -1,16 +1,14 @@
|
|
|
1
1
|
import contextlib
|
|
2
2
|
import math
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from collections.abc import Generator, Sequence
|
|
5
|
-
from typing import Callable, Optional, Union
|
|
4
|
+
from collections.abc import Callable, Generator, Sequence
|
|
6
5
|
|
|
7
6
|
import sqlalchemy as sa
|
|
8
7
|
|
|
9
8
|
from datachain.data_storage.schema import PARTITION_COLUMN_ID
|
|
10
|
-
from datachain.query.utils import get_query_column
|
|
11
9
|
|
|
12
10
|
RowsOutputBatch = Sequence[Sequence]
|
|
13
|
-
RowsOutput =
|
|
11
|
+
RowsOutput = Sequence | RowsOutputBatch
|
|
14
12
|
|
|
15
13
|
|
|
16
14
|
class BatchingStrategy(ABC):
|
|
@@ -23,7 +21,7 @@ class BatchingStrategy(ABC):
|
|
|
23
21
|
self,
|
|
24
22
|
execute: Callable,
|
|
25
23
|
query: sa.Select,
|
|
26
|
-
id_col:
|
|
24
|
+
id_col: sa.ColumnElement | None = None,
|
|
27
25
|
) -> Generator[RowsOutput, None, None]:
|
|
28
26
|
"""Apply the provided parameters to the UDF."""
|
|
29
27
|
|
|
@@ -40,7 +38,7 @@ class NoBatching(BatchingStrategy):
|
|
|
40
38
|
self,
|
|
41
39
|
execute: Callable,
|
|
42
40
|
query: sa.Select,
|
|
43
|
-
id_col:
|
|
41
|
+
id_col: sa.ColumnElement | None = None,
|
|
44
42
|
) -> Generator[Sequence, None, None]:
|
|
45
43
|
ids_only = False
|
|
46
44
|
if id_col is not None:
|
|
@@ -66,7 +64,7 @@ class Batch(BatchingStrategy):
|
|
|
66
64
|
self,
|
|
67
65
|
execute: Callable,
|
|
68
66
|
query: sa.Select,
|
|
69
|
-
id_col:
|
|
67
|
+
id_col: sa.ColumnElement | None = None,
|
|
70
68
|
) -> Generator[RowsOutput, None, None]:
|
|
71
69
|
from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
|
|
72
70
|
|
|
@@ -81,8 +79,8 @@ class Batch(BatchingStrategy):
|
|
|
81
79
|
# select rows in batches
|
|
82
80
|
results = []
|
|
83
81
|
|
|
84
|
-
with contextlib.closing(execute(query, page_size=page_size)) as
|
|
85
|
-
for row in
|
|
82
|
+
with contextlib.closing(execute(query, page_size=page_size)) as rows:
|
|
83
|
+
for row in rows:
|
|
86
84
|
results.append(row)
|
|
87
85
|
if len(results) >= self.count:
|
|
88
86
|
batch, results = results[: self.count], results[self.count :]
|
|
@@ -105,9 +103,9 @@ class Partition(BatchingStrategy):
|
|
|
105
103
|
self,
|
|
106
104
|
execute: Callable,
|
|
107
105
|
query: sa.Select,
|
|
108
|
-
id_col:
|
|
106
|
+
id_col: sa.ColumnElement | None = None,
|
|
109
107
|
) -> Generator[RowsOutput, None, None]:
|
|
110
|
-
if (partition_col :=
|
|
108
|
+
if (partition_col := query.selected_columns.get(PARTITION_COLUMN_ID)) is None:
|
|
111
109
|
raise RuntimeError("partition column not found in query")
|
|
112
110
|
|
|
113
111
|
ids_only = False
|
|
@@ -115,7 +113,7 @@ class Partition(BatchingStrategy):
|
|
|
115
113
|
query = query.with_only_columns(id_col, partition_col)
|
|
116
114
|
ids_only = True
|
|
117
115
|
|
|
118
|
-
current_partition:
|
|
116
|
+
current_partition: int | None = None
|
|
119
117
|
batch: list = []
|
|
120
118
|
|
|
121
119
|
query_fields = [str(c.name) for c in query.selected_columns]
|