datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of datachain has been flagged as potentially problematic.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +32 -28
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +3 -5
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
- datachain-0.35.0.dist-info/RECORD +173 -0
- datachain-0.34.6.dist-info/RECORD +0 -173
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
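Most of the per-file changes below are a mechanical migration of type annotations from `typing.Optional[...]` and `typing.Union[...]` to PEP 604 unions (`X | None`, `X | Y`), with the corresponding `typing` imports trimmed. A minimal before/after sketch of the pattern (illustrative only, not code from the package):

from typing import Optional, Union


# Pre-0.35.0 style (illustrative):
def read_example(settings: Optional[dict] = None) -> Union[int, None]:
    return None if settings is None else len(settings)


# 0.35.0 style, as seen throughout this diff:
def read_example_pep604(settings: dict | None = None) -> int | None:
    return None if settings is None else len(settings)

Because several aliases (for example `MergeColType` and `OutputType` in datachain/lib/dc/utils.py) are now evaluated as `X | Y` at import time rather than only inside annotations, this style implies a Python 3.10+ floor; that floor is an assumption here, since the METADATA change is not shown in full in this diff.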
datachain/lib/dc/pandas.py
CHANGED
@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.query import Session
 
@@ -19,8 +16,8 @@ if TYPE_CHECKING:
 def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     column: str = "",
 ) -> "DataChain":
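A minimal usage sketch of the updated `read_pandas` signature; it assumes `read_pandas` is re-exported at the package root (as `datachain.read_pandas`), and the DataFrame contents are made up:

import pandas as pd

import datachain as dc

df = pd.DataFrame({"name": ["a", "b"], "score": [0.1, 0.9]})

# session and settings now default to None via `Session | None` / `dict | None`.
chain = dc.read_pandas(df, name="scores", column="row")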
datachain/lib/dc/parquet.py
CHANGED
@@ -1,5 +1,5 @@
 import os
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any
 
 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -13,14 +13,14 @@ if TYPE_CHECKING:
 
 
 def read_parquet(
-    path:
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     partitioning: Any = "hive",
-    output:
+    output: dict[str, DataType] | None = None,
     column: str = "",
     model_name: str = "",
     source: bool = True,
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from parquet files.
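For context, a hedged sketch of calling `read_parquet` with the widened `path` type; the bucket and settings values are placeholders, and the top-level `datachain.read_parquet` re-export is assumed:

import datachain as dc

# `path` accepts a single str/os.PathLike or a list of either.
chain = dc.read_parquet(
    ["s3://my-bucket/part-0.parquet", "s3://my-bucket/part-1.parquet"],
    partitioning="hive",
)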
datachain/lib/dc/records.py
CHANGED
@@ -1,5 +1,5 @@
 from collections.abc import Iterable
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 import sqlalchemy
 
@@ -19,11 +19,11 @@ READ_RECORDS_BATCH_SIZE = 10000
 
 
 def read_records(
-    to_insert:
-    session:
-    settings:
+    to_insert: dict | Iterable[dict] | None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    schema:
+    schema: dict[str, DataType] | None = None,
 ) -> "DataChain":
     """Create a DataChain from the provided records. This method can be used for
     programmatically generating a chain in contrast of reading data from storages
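A short sketch of `read_records` with the new annotations; it assumes the top-level `datachain.read_records` re-export and made-up record data:

import datachain as dc

records = [
    {"id": 1, "label": "cat"},
    {"id": 2, "label": "dog"},
]

# `to_insert` is `dict | Iterable[dict] | None`; `schema` maps column names to types.
chain = dc.read_records(records, schema={"id": int, "label": str})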
datachain/lib/dc/storage.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 from collections.abc import Sequence
 from functools import reduce
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from datachain.lib.dc.storage_pattern import (
     apply_glob_filter,
@@ -19,27 +19,27 @@ if TYPE_CHECKING:
 
 
 def read_storage(
-    uri:
+    uri: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     *,
     type: FileType = "binary",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    recursive:
+    recursive: bool | None = True,
     column: str = "file",
     update: bool = False,
-    anon:
-    delta:
-    delta_on:
+    anon: bool | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
         "file.path",
         "file.etag",
         "file.version",
     ),
-    delta_result_on:
-    delta_compare:
-    delta_retry:
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
     delta_unsafe: bool = False,
-    client_config:
+    client_config: dict | None = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.
datachain/lib/dc/storage_pattern.py
CHANGED
@@ -1,5 +1,5 @@
 import glob
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from datachain.client.fsspec import is_cloud_uri
 from datachain.lib.listing import ls
@@ -32,7 +32,7 @@ def validate_cloud_bucket_name(uri: str) -> None:
         raise ValueError(f"Glob patterns in bucket names are not supported: {uri}")
 
 
-def split_uri_pattern(uri: str) -> tuple[str,
+def split_uri_pattern(uri: str) -> tuple[str, str | None]:
     """Split a URI into base path and glob pattern."""
     if not any(char in uri for char in ["*", "?", "[", "{", "}"]):
         return uri, None
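A hedged sketch of the keyword-only `read_storage` signature shown above; the bucket URI is a placeholder and the delta keys simply restate the defaults:

import datachain as dc

chain = dc.read_storage(
    "s3://my-bucket/images/",
    type="image",
    recursive=True,
    # Delta processing re-lists only rows whose identifying fields changed.
    delta=True,
    delta_on=("file.path", "file.etag", "file.version"),
)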
datachain/lib/dc/utils.py
CHANGED
@@ -1,12 +1,6 @@
 from collections.abc import Sequence
 from functools import wraps
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import TYPE_CHECKING, TypeVar
 
 import sqlalchemy
 from sqlalchemy.sql.functions import GenericFunction
@@ -18,7 +12,10 @@ from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.utils import getenv_bool
 
 if TYPE_CHECKING:
-    from
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
 
     from .datachain import DataChain
 
@@ -70,11 +67,11 @@ class DatasetFromValuesError(DataChainParamsError):
         super().__init__(f"Dataset{name} from values error: {msg}")
 
 
-MergeColType =
+MergeColType = str | Function | sqlalchemy.ColumnElement
 
 
 def _validate_merge_on(
-    on:
+    on: MergeColType | Sequence[MergeColType],
     ds: "DataChain",
 ) -> Sequence[MergeColType]:
     if isinstance(on, (str, sqlalchemy.ColumnElement)):
@@ -103,12 +100,12 @@ def _get_merge_error_str(col: MergeColType) -> str:
 class DatasetMergeError(DataChainParamsError):
     def __init__(
         self,
-        on:
-        right_on:
+        on: MergeColType | Sequence[MergeColType],
+        right_on: MergeColType | Sequence[MergeColType] | None,
         msg: str,
     ):
         def _get_str(
-            on:
+            on: MergeColType | Sequence[MergeColType],
         ) -> str:
             if not isinstance(on, Sequence):
                 return str(on)  # type: ignore[unreachable]
@@ -123,7 +120,7 @@ class DatasetMergeError(DataChainParamsError):
         super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
 
 
-OutputType =
+OutputType = DataType | Sequence[str] | dict[str, DataType] | None
 
 
 class Sys(DataModel):
datachain/lib/dc/values.py
CHANGED
@@ -1,8 +1,5 @@
 from collections.abc import Iterator
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
@@ -20,8 +17,8 @@ if TYPE_CHECKING:
 
 def read_values(
     ds_name: str = "",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     output: OutputType = None,
     column: str = "",
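For reference, a minimal `read_values` call matching the updated signature; it assumes column values are passed as keyword arguments, as in earlier releases, and that `datachain.read_values` is the public entry point:

import datachain as dc

chain = dc.read_values(
    ids=[1, 2, 3],
    names=["a", "b", "c"],
    output={"ids": int, "names": str},
)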
datachain/lib/file.py
CHANGED
@@ -13,7 +13,7 @@ from datetime import datetime
 from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any, ClassVar, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
 
@@ -53,12 +53,12 @@ class FileExporter(NodesThreadPool):
 
     def __init__(
         self,
-        output:
+        output: str | os.PathLike[str],
         placement: ExportPlacement,
         use_cache: bool,
         link_type: Literal["copy", "symlink"],
         max_threads: int = EXPORT_FILES_MAX_THREADS,
-        client_config:
+        client_config: dict | None = None,
     ):
         super().__init__(max_threads)
         self.output = output
@@ -221,7 +221,7 @@ class File(DataModel):
     etag: str = Field(default="")
     is_latest: bool = Field(default=True)
     last_modified: datetime = Field(default=TIME_ZERO)
-    location:
+    location: dict | list[dict] | None = Field(default=None)
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
@@ -264,8 +264,8 @@ class File(DataModel):
 
     @staticmethod
     def _validate_dict(
-        v:
-    ) ->
+        v: str | dict | list[dict] | None,
+    ) -> str | dict | list[dict] | None:
         if v is None or v == "":
             return None
         if isinstance(v, str):
@@ -334,8 +334,8 @@ class File(DataModel):
     def upload(
         cls,
         data: bytes,
-        path:
-        catalog:
+        path: str | os.PathLike[str],
+        catalog: "Catalog | None" = None,
    ) -> "Self":
        if catalog is None:
            from datachain.catalog.loader import get_catalog
@@ -357,7 +357,7 @@ class File(DataModel):
 
     @classmethod
     def at(
-        cls, uri:
+        cls, uri: str | os.PathLike[str], session: "Session | None" = None
     ) -> "Self":
         """Construct a File from a full URI in one call.
 
@@ -470,7 +470,7 @@ class File(DataModel):
         """Returns file contents."""
         return self.read_bytes(length)
 
-    def save(self, destination: str, client_config:
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
         client: Client = self._catalog.get_client(destination, **(client_config or {}))
@@ -497,11 +497,11 @@ class File(DataModel):
 
     def export(
         self,
-        output:
+        output: str | os.PathLike[str],
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
         link_type: Literal["copy", "symlink"] = "copy",
-        client_config:
+        client_config: dict | None = None,
     ) -> None:
         """Export file to new location."""
         self._caching_enabled = use_cache
@@ -537,7 +537,7 @@ class File(DataModel):
         client = self._catalog.get_client(self.source)
         client.download(self, callback=self._download_cb)
 
-    async def _prefetch(self, download_cb:
+    async def _prefetch(self, download_cb: "Callback | None" = None) -> bool:
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
 
@@ -552,7 +552,7 @@ class File(DataModel):
             )
             return True
 
-    def get_local_path(self) ->
+    def get_local_path(self) -> str | None:
         """Return path to a file in a local cache.
 
         Returns None if file is not cached.
@@ -629,7 +629,7 @@ class File(DataModel):
         return path
 
     def get_destination_path(
-        self, output:
+        self, output: str | os.PathLike[str], placement: ExportPlacement
     ) -> str:
         """
         Returns full destination path of a file for exporting to some output
@@ -681,7 +681,7 @@ class File(DataModel):
             normalized_path = self.get_path_normalized()
             info = client.fs.info(client.get_full_path(normalized_path))
             converted_info = client.info_to_file(info, normalized_path)
-
+            res = type(self)(
                 path=self.path,
                 source=self.source,
                 size=converted_info.size,
@@ -691,6 +691,8 @@ class File(DataModel):
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+            res._set_stream(self._catalog)
+            return res
         except FileError as e:
             logger.warning(
                 "File error when resolving %s/%s: %s", self.source, self.path, str(e)
@@ -703,7 +705,7 @@ class File(DataModel):
                 str(e),
             )
 
-
+        res = type(self)(
             path=self.path,
             source=self.source,
             size=0,
@@ -713,6 +715,8 @@ class File(DataModel):
             last_modified=TIME_ZERO,
             location=self.location,
         )
+        res._set_stream(self._catalog)
+        return res
 
     def rebase(
         self,
@@ -796,7 +800,7 @@ class TextFile(File):
         with self.open(**open_kwargs) as stream:
             return stream.read()
 
-    def save(self, destination: str, client_config:
+    def save(self, destination: str, client_config: dict | None = None):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
 
@@ -829,8 +833,8 @@ class ImageFile(File):
     def save(  # type: ignore[override]
         self,
         destination: str,
-        format:
-        client_config:
+        format: str | None = None,
+        client_config: dict | None = None,
     ):
         """Writes it's content to destination"""
         destination = stringify_path(destination)
@@ -912,7 +916,7 @@ class VideoFile(File):
     def get_frames(
         self,
         start: int = 0,
-        end:
+        end: int | None = None,
         step: int = 1,
     ) -> "Iterator[VideoFrame]":
         """
@@ -962,7 +966,7 @@ class VideoFile(File):
         self,
         duration: float,
         start: float = 0,
-        end:
+        end: float | None = None,
     ) -> "Iterator[VideoFragment]":
         """
         Splits the video into multiple fragments of a specified duration.
@@ -1048,7 +1052,7 @@ class AudioFile(File):
         self,
         duration: float,
         start: float = 0,
-        end:
+        end: float | None = None,
     ) -> "Iterator[AudioFragment]":
         """
         Splits the audio into multiple fragments of a specified duration.
@@ -1086,10 +1090,10 @@ class AudioFile(File):
     def save(  # type: ignore[override]
         self,
         output: str,
-        format:
+        format: str | None = None,
         start: float = 0,
-        end:
-        client_config:
+        end: float | None = None,
+        client_config: dict | None = None,
     ) -> "AudioFile":
         """Save audio file or extract fragment to specified format.
 
@@ -1160,7 +1164,7 @@ class AudioFragment(DataModel):
         duration = self.end - self.start
         return audio_to_bytes(self.audio, format, self.start, duration)
 
-    def save(self, output: str, format:
+    def save(self, output: str, format: str | None = None) -> "AudioFile":
         """
         Saves the audio fragment as a new audio file.
 
@@ -1263,7 +1267,7 @@ class VideoFragment(DataModel):
     start: float
     end: float
 
-    def save(self, output: str, format:
+    def save(self, output: str, format: str | None = None) -> "VideoFile":
         """
         Saves the video fragment as a new video file.
 
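A short usage sketch of the `File` APIs whose signatures changed above (`File.at`, `File.export`); the URI and output directory are placeholders:

from pathlib import Path

from datachain.lib.file import File

# Build a File from a full URI in one call (see the `at` classmethod above).
f = File.at("s3://my-bucket/data/report.txt")

# `output` now accepts str or os.PathLike; "fullpath" keeps the source layout.
f.export(Path("./exported"), placement="fullpath", use_cache=True)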
datachain/lib/hf.py
CHANGED
@@ -26,7 +26,7 @@ except ImportError as exc:
     ) from exc
 
 from io import BytesIO
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 import PIL
 from tqdm.auto import tqdm
@@ -41,7 +41,9 @@ if TYPE_CHECKING:
     from pydantic import BaseModel
 
 
-HFDatasetType =
+HFDatasetType: TypeAlias = (
+    str | DatasetDict | Dataset | IterableDatasetDict | IterableDataset
+)
 
 
 class HFClassLabel(DataModel):
@@ -67,7 +69,7 @@ class HFAudio(DataModel):
 class HFGenerator(Generator):
     def __init__(
         self,
-        ds:
+        ds: HFDatasetType,
         output_schema: type["BaseModel"],
         limit: int = 0,
         *args,
@@ -117,7 +119,7 @@ class HFGenerator(Generator):
             pbar.update(1)
 
 
-def stream_splits(ds:
+def stream_splits(ds: HFDatasetType, *args, **kwargs):
     if isinstance(ds, str):
         ds = load_dataset(ds, *args, **kwargs)
     if isinstance(ds, (DatasetDict, IterableDatasetDict)):
@@ -153,7 +155,7 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
 
 
 def get_output_schema(
-    features: Features, existing_column_names:
+    features: Features, existing_column_names: list[str] | None = None
 ) -> tuple[dict[str, DataType], dict[str, str]]:
     """
     Generate UDF output schema from Hugging Face datasets features. It normalizes the
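The `HFDatasetType` alias now uses an explicit PEP 613 `TypeAlias` with a PEP 604 union. A standalone illustration of the same pattern (not code from the package):

from typing import TypeAlias

# Runtime-evaluated union alias: every member must be an importable class,
# and `X | Y` on classes needs Python 3.10+.
PathLike: TypeAlias = str | bytes


def describe(p: PathLike) -> str:
    return p.decode() if isinstance(p, bytes) else p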
datachain/lib/image.py
CHANGED
@@ -1,4 +1,4 @@
-from
+from collections.abc import Callable
 
 import torch
 from PIL import Image as PILImage
@@ -6,7 +6,7 @@ from PIL import Image as PILImage
 from datachain.lib.file import File, FileError, Image, ImageFile
 
 
-def image_info(file:
+def image_info(file: File | ImageFile) -> Image:
     """
     Returns image file information.
 
@@ -31,11 +31,11 @@ def image_info(file: Union[File, ImageFile]) -> Image:
 def convert_image(
     img: PILImage.Image,
     mode: str = "RGB",
-    size:
-    transform:
-    encoder:
-    device:
-) ->
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> PILImage.Image | torch.Tensor:
     """
     Resize, transform, and otherwise convert an image.
 
@@ -71,13 +71,13 @@ def convert_image(
 
 
 def convert_images(
-    images:
+    images: PILImage.Image | list[PILImage.Image],
     mode: str = "RGB",
-    size:
-    transform:
-    encoder:
-    device:
-) ->
+    size: tuple[int, int] | None = None,
+    transform: Callable | None = None,
+    encoder: Callable | None = None,
+    device: str | torch.device | None = None,
+) -> list[PILImage.Image] | torch.Tensor:
     """
     Resize, transform, and otherwise convert one or more images.
 
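A minimal sketch of `convert_images` with the new union annotations; per the return type, the torch.Tensor branch is assumed to correspond to passing an `encoder`:

from PIL import Image as PILImage

from datachain.lib.image import convert_images

imgs = [PILImage.new("RGB", (64, 64)), PILImage.new("L", (32, 32))]

# Without an encoder this returns a list of PIL images resized to 224x224.
resized = convert_images(imgs, mode="RGB", size=(224, 224))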
datachain/lib/listing.py
CHANGED
@@ -2,10 +2,10 @@ import glob
 import logging
 import os
 import posixpath
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from contextlib import contextmanager
 from datetime import datetime, timedelta, timezone
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, TypeVar
 
 from fsspec.asyn import get_loop
 from sqlalchemy.sql.expression import true
@@ -73,7 +73,7 @@ def get_file_info(uri: str, cache, client_config=None) -> File:
 def ls(
     dc: D,
     path: str,
-    recursive:
+    recursive: bool | None = True,
     column="file",
 ) -> D:
     """
@@ -150,8 +150,8 @@ def _reraise_as_client_error() -> Iterator[None]:
 
 
 def get_listing(
-    uri:
-) -> tuple[
+    uri: str | os.PathLike[str], session: "Session", update: bool = False
+) -> tuple[str | None, str, str, bool]:
     """Returns correct listing dataset name that must be used for saving listing
     operation. It takes into account existing listings and reusability of those.
     It also returns boolean saying if returned dataset name is reused / already
datachain/lib/listing_info.py
CHANGED
@@ -1,5 +1,4 @@
 from datetime import datetime, timedelta, timezone
-from typing import Optional
 
 from datachain.client import Client
 from datachain.lib.dataset_info import DatasetInfo
@@ -17,7 +16,7 @@ class ListingInfo(DatasetInfo):
         return uri
 
     @property
-    def expires(self) ->
+    def expires(self) -> datetime | None:
         if not self.finished_at:
             return None
         return self.finished_at + timedelta(seconds=LISTING_TTL)
datachain/lib/meta_formats.py
CHANGED
@@ -2,9 +2,8 @@ import csv
 import json
 import tempfile
 import uuid
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from pathlib import Path
-from typing import Callable
 
 import jmespath as jsp
 from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
datachain/lib/model_store.py
CHANGED
@@ -1,6 +1,6 @@
 import inspect
 import logging
-from typing import Any, ClassVar
+from typing import Any, ClassVar
 
 from pydantic import BaseModel
 
@@ -39,7 +39,7 @@ class ModelStore:
         cls.register(anno)
 
     @classmethod
-    def get(cls, name: str, version:
+    def get(cls, name: str, version: int | None = None) -> type | None:
         class_dict = cls.store.get(name, None)
         if class_dict is None:
             return None
@@ -77,7 +77,7 @@ class ModelStore:
         )
 
     @staticmethod
-    def to_pydantic(val) ->
+    def to_pydantic(val) -> type[BaseModel] | None:
         if val is None or not ModelStore.is_pydantic(val):
             return None
         return val
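A hedged sketch of `ModelStore.get` with the new `int | None` version parameter; version resolution behavior beyond the signature shown above is an assumption:

from pydantic import BaseModel

from datachain.lib.model_store import ModelStore


class Point(BaseModel):
    x: int
    y: int


ModelStore.register(Point)
# `version=None` is assumed to resolve to the latest registered version;
# the return type is now `type | None`.
point_cls = ModelStore.get("Point")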
datachain/lib/namespaces.py
CHANGED
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from datachain.error import (
     NamespaceCreateNotAllowedError,
     NamespaceDeleteNotAllowedError,
@@ -10,7 +8,7 @@ from datachain.query import Session
 
 
 def create(
-    name: str, descr:
+    name: str, descr: str | None = None, session: Session | None = None
 ) -> Namespace:
     """
     Creates a new namespace.
@@ -42,7 +40,7 @@ def create(
     return session.catalog.metastore.create_namespace(name, descr)
 
 
-def get(name: str, session:
+def get(name: str, session: Session | None = None) -> Namespace:
     """
     Gets a namespace by name.
     If the namespace is not found, a `NamespaceNotFoundError` is raised.
@@ -61,7 +59,7 @@ def get(name: str, session: Optional[Session] = None) -> Namespace:
     return session.catalog.metastore.get_namespace(name)
 
 
-def ls(session:
+def ls(session: Session | None = None) -> list[Namespace]:
     """
     Gets a list of all namespaces.
 
@@ -77,7 +75,7 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
     return Session.get(session).catalog.metastore.list_namespaces()
 
 
-def delete_namespace(name: str, session:
+def delete_namespace(name: str, session: Session | None = None) -> None:
     """
     Removes a namespace by name.
 
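Finally, a hedged sketch of the namespace helpers with the updated `| None` defaults; it assumes the module import path `datachain.lib.namespaces` and that the active metastore permits namespace creation and deletion:

from datachain.lib import namespaces

# descr and session now default to None via `str | None` / `Session | None`.
ns = namespaces.create("team-a", descr="experiments for team A")
print([n.name for n in namespaces.ls()])
namespaces.delete_namespace("team-a")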