datachain 0.34.6__py3-none-any.whl → 0.34.7__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/catalog.py +75 -83
- datachain/catalog/loader.py +3 -3
- datachain/checkpoint.py +1 -2
- datachain/cli/__init__.py +2 -4
- datachain/cli/commands/datasets.py +13 -13
- datachain/cli/commands/ls.py +4 -4
- datachain/cli/commands/query.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +1 -2
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +11 -21
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +4 -4
- datachain/client/local.py +4 -4
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +5 -5
- datachain/data_storage/metastore.py +107 -107
- datachain/data_storage/schema.py +18 -24
- datachain/data_storage/sqlite.py +21 -28
- datachain/data_storage/warehouse.py +13 -13
- datachain/dataset.py +64 -70
- datachain/delta.py +21 -18
- datachain/diff/__init__.py +13 -13
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +45 -42
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +54 -81
- datachain/job.py +8 -8
- datachain/lib/arrow.py +17 -14
- datachain/lib/audio.py +6 -6
- datachain/lib/clip.py +5 -4
- datachain/lib/convert/python_to_sql.py +4 -22
- datachain/lib/convert/values_to_tuples.py +4 -9
- datachain/lib/data_model.py +20 -19
- datachain/lib/dataset_info.py +6 -6
- datachain/lib/dc/csv.py +10 -10
- datachain/lib/dc/database.py +28 -29
- datachain/lib/dc/datachain.py +98 -97
- datachain/lib/dc/datasets.py +22 -22
- datachain/lib/dc/hf.py +4 -4
- datachain/lib/dc/json.py +9 -10
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +5 -5
- datachain/lib/dc/storage.py +12 -12
- datachain/lib/dc/storage_pattern.py +2 -2
- datachain/lib/dc/utils.py +11 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +26 -26
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +1 -2
- datachain/lib/model_store.py +3 -3
- datachain/lib/namespaces.py +4 -6
- datachain/lib/projects.py +5 -9
- datachain/lib/pytorch.py +10 -10
- datachain/lib/settings.py +23 -23
- datachain/lib/signal_schema.py +52 -44
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +25 -17
- datachain/lib/udf_signature.py +11 -11
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +30 -35
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +4 -4
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +1 -7
- datachain/project.py +4 -4
- datachain/query/batch.py +7 -8
- datachain/query/dataset.py +80 -87
- datachain/query/dispatch.py +7 -7
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/schema.py +7 -6
- datachain/query/session.py +7 -7
- datachain/query/udf.py +8 -7
- datachain/query/utils.py +3 -5
- datachain/remote/studio.py +33 -39
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +6 -9
- datachain/studio.py +30 -30
- datachain/toolkit/split.py +1 -2
- datachain/utils.py +21 -21
- {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/METADATA +2 -3
- datachain-0.34.7.dist-info/RECORD +173 -0
- datachain-0.34.6.dist-info/RECORD +0 -173
- {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/WHEEL +0 -0
- {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.34.6.dist-info → datachain-0.34.7.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datachain.py
CHANGED
|
@@ -4,18 +4,15 @@ import os
|
|
|
4
4
|
import os.path
|
|
5
5
|
import sys
|
|
6
6
|
import warnings
|
|
7
|
-
from collections.abc import Iterator, Sequence
|
|
7
|
+
from collections.abc import Callable, Iterator, Sequence
|
|
8
8
|
from typing import (
|
|
9
9
|
IO,
|
|
10
10
|
TYPE_CHECKING,
|
|
11
11
|
Any,
|
|
12
12
|
BinaryIO,
|
|
13
|
-
Callable,
|
|
14
13
|
ClassVar,
|
|
15
14
|
Literal,
|
|
16
|
-
Optional,
|
|
17
15
|
TypeVar,
|
|
18
|
-
Union,
|
|
19
16
|
cast,
|
|
20
17
|
overload,
|
|
21
18
|
)
|
|
@@ -85,19 +82,20 @@ if TYPE_CHECKING:
|
|
|
85
82
|
import sqlite3
|
|
86
83
|
|
|
87
84
|
import pandas as pd
|
|
85
|
+
from sqlalchemy.orm import Session as OrmSession
|
|
88
86
|
from typing_extensions import ParamSpec, Self
|
|
89
87
|
|
|
90
88
|
P = ParamSpec("P")
|
|
91
89
|
|
|
92
|
-
ConnectionType =
|
|
93
|
-
str
|
|
94
|
-
sqlalchemy.engine.URL
|
|
95
|
-
sqlalchemy.engine.interfaces.Connectable
|
|
96
|
-
sqlalchemy.engine.Engine
|
|
97
|
-
sqlalchemy.engine.Connection
|
|
98
|
-
|
|
99
|
-
sqlite3.Connection
|
|
100
|
-
|
|
90
|
+
ConnectionType = (
|
|
91
|
+
str
|
|
92
|
+
| sqlalchemy.engine.URL
|
|
93
|
+
| sqlalchemy.engine.interfaces.Connectable
|
|
94
|
+
| sqlalchemy.engine.Engine
|
|
95
|
+
| sqlalchemy.engine.Connection
|
|
96
|
+
| OrmSession
|
|
97
|
+
| sqlite3.Connection
|
|
98
|
+
)
|
|
101
99
|
|
|
102
100
|
|
|
103
101
|
T = TypeVar("T", bound="DataChain")
|
|
@@ -186,7 +184,7 @@ class DataChain:
|
|
|
186
184
|
query: DatasetQuery,
|
|
187
185
|
settings: Settings,
|
|
188
186
|
signal_schema: SignalSchema,
|
|
189
|
-
setup:
|
|
187
|
+
setup: dict | None = None,
|
|
190
188
|
_sys: bool = False,
|
|
191
189
|
) -> None:
|
|
192
190
|
"""Don't instantiate this directly, use one of the from_XXX constructors."""
|
|
@@ -197,10 +195,10 @@ class DataChain:
|
|
|
197
195
|
self._sys = _sys
|
|
198
196
|
self._delta = False
|
|
199
197
|
self._delta_unsafe = False
|
|
200
|
-
self._delta_on:
|
|
201
|
-
self._delta_result_on:
|
|
202
|
-
self._delta_compare:
|
|
203
|
-
self._delta_retry:
|
|
198
|
+
self._delta_on: str | Sequence[str] | None = None
|
|
199
|
+
self._delta_result_on: str | Sequence[str] | None = None
|
|
200
|
+
self._delta_compare: str | Sequence[str] | None = None
|
|
201
|
+
self._delta_retry: bool | str | None = None
|
|
204
202
|
|
|
205
203
|
def __repr__(self) -> str:
|
|
206
204
|
"""Return a string representation of the chain."""
|
|
@@ -224,10 +222,10 @@ class DataChain:
|
|
|
224
222
|
|
|
225
223
|
def _as_delta(
|
|
226
224
|
self,
|
|
227
|
-
on:
|
|
228
|
-
right_on:
|
|
229
|
-
compare:
|
|
230
|
-
delta_retry:
|
|
225
|
+
on: str | Sequence[str] | None = None,
|
|
226
|
+
right_on: str | Sequence[str] | None = None,
|
|
227
|
+
compare: str | Sequence[str] | None = None,
|
|
228
|
+
delta_retry: bool | str | None = None,
|
|
231
229
|
delta_unsafe: bool = False,
|
|
232
230
|
) -> "Self":
|
|
233
231
|
"""Marks this chain as delta, which means special delta process will be
|
|
@@ -277,7 +275,7 @@ class DataChain:
|
|
|
277
275
|
|
|
278
276
|
raise ValueError(f"Column with name {name} not found in the schema")
|
|
279
277
|
|
|
280
|
-
def c(self, column:
|
|
278
|
+
def c(self, column: str | Column) -> Column:
|
|
281
279
|
"""Returns Column instance attached to the current chain."""
|
|
282
280
|
c = self.column(column) if isinstance(column, str) else self.column(column.name)
|
|
283
281
|
c.table = self._query.table
|
|
@@ -289,17 +287,17 @@ class DataChain:
|
|
|
289
287
|
return self._query.session
|
|
290
288
|
|
|
291
289
|
@property
|
|
292
|
-
def name(self) ->
|
|
290
|
+
def name(self) -> str | None:
|
|
293
291
|
"""Name of the underlying dataset, if there is one."""
|
|
294
292
|
return self._query.name
|
|
295
293
|
|
|
296
294
|
@property
|
|
297
|
-
def version(self) ->
|
|
295
|
+
def version(self) -> str | None:
|
|
298
296
|
"""Version of the underlying dataset, if there is one."""
|
|
299
297
|
return self._query.version
|
|
300
298
|
|
|
301
299
|
@property
|
|
302
|
-
def dataset(self) ->
|
|
300
|
+
def dataset(self) -> DatasetRecord | None:
|
|
303
301
|
"""Underlying dataset, if there is one."""
|
|
304
302
|
if not self.name:
|
|
305
303
|
return None
|
|
@@ -313,7 +311,7 @@ class DataChain:
|
|
|
313
311
|
"""Return `self.union(other)`."""
|
|
314
312
|
return self.union(other)
|
|
315
313
|
|
|
316
|
-
def print_schema(self, file:
|
|
314
|
+
def print_schema(self, file: IO | None = None) -> None:
|
|
317
315
|
"""Print schema of the chain."""
|
|
318
316
|
self._effective_signals_schema.print_tree(file=file)
|
|
319
317
|
|
|
@@ -324,8 +322,8 @@ class DataChain:
|
|
|
324
322
|
def _evolve(
|
|
325
323
|
self,
|
|
326
324
|
*,
|
|
327
|
-
query:
|
|
328
|
-
settings:
|
|
325
|
+
query: DatasetQuery | None = None,
|
|
326
|
+
settings: Settings | None = None,
|
|
329
327
|
signal_schema=None,
|
|
330
328
|
_sys=None,
|
|
331
329
|
) -> "Self":
|
|
@@ -353,15 +351,15 @@ class DataChain:
|
|
|
353
351
|
|
|
354
352
|
def settings(
|
|
355
353
|
self,
|
|
356
|
-
cache:
|
|
357
|
-
prefetch:
|
|
358
|
-
parallel:
|
|
359
|
-
workers:
|
|
360
|
-
namespace:
|
|
361
|
-
project:
|
|
362
|
-
min_task_size:
|
|
363
|
-
batch_size:
|
|
364
|
-
sys:
|
|
354
|
+
cache: bool | None = None,
|
|
355
|
+
prefetch: bool | int | None = None,
|
|
356
|
+
parallel: bool | int | None = None,
|
|
357
|
+
workers: int | None = None,
|
|
358
|
+
namespace: str | None = None,
|
|
359
|
+
project: str | None = None,
|
|
360
|
+
min_task_size: int | None = None,
|
|
361
|
+
batch_size: int | None = None,
|
|
362
|
+
sys: bool | None = None,
|
|
365
363
|
) -> "Self":
|
|
366
364
|
"""
|
|
367
365
|
Set chain execution parameters. Returns the chain itself, allowing method
|
|
@@ -412,7 +410,7 @@ class DataChain:
|
|
|
412
410
|
)
|
|
413
411
|
return self._evolve(settings=settings, _sys=sys)
|
|
414
412
|
|
|
415
|
-
def reset_settings(self, settings:
|
|
413
|
+
def reset_settings(self, settings: Settings | None = None) -> "Self":
|
|
416
414
|
"""Reset all chain settings to default values."""
|
|
417
415
|
self._settings = settings if settings else Settings()
|
|
418
416
|
return self
|
|
@@ -464,8 +462,8 @@ class DataChain:
|
|
|
464
462
|
def explode(
|
|
465
463
|
self,
|
|
466
464
|
col: str,
|
|
467
|
-
model_name:
|
|
468
|
-
column:
|
|
465
|
+
model_name: str | None = None,
|
|
466
|
+
column: str | None = None,
|
|
469
467
|
schema_sample_size: int = 1,
|
|
470
468
|
) -> "DataChain":
|
|
471
469
|
"""Explodes a column containing JSON objects (dict or str DataChain type) into
|
|
@@ -506,7 +504,7 @@ class DataChain:
|
|
|
506
504
|
|
|
507
505
|
model = dict_to_data_model(model_name, output, original_names)
|
|
508
506
|
|
|
509
|
-
def json_to_model(json_value:
|
|
507
|
+
def json_to_model(json_value: str | dict):
|
|
510
508
|
json_dict = (
|
|
511
509
|
json.loads(json_value) if isinstance(json_value, str) else json_value
|
|
512
510
|
)
|
|
@@ -599,10 +597,10 @@ class DataChain:
|
|
|
599
597
|
def save( # type: ignore[override]
|
|
600
598
|
self,
|
|
601
599
|
name: str,
|
|
602
|
-
version:
|
|
603
|
-
description:
|
|
604
|
-
attrs:
|
|
605
|
-
update_version:
|
|
600
|
+
version: str | None = None,
|
|
601
|
+
description: str | None = None,
|
|
602
|
+
attrs: list[str] | None = None,
|
|
603
|
+
update_version: str | None = "patch",
|
|
606
604
|
**kwargs,
|
|
607
605
|
) -> "DataChain":
|
|
608
606
|
"""Save to a Dataset. It returns the chain itself.
|
|
@@ -666,12 +664,12 @@ class DataChain:
|
|
|
666
664
|
|
|
667
665
|
return result
|
|
668
666
|
|
|
669
|
-
def _validate_version(self, version:
|
|
667
|
+
def _validate_version(self, version: str | None) -> None:
|
|
670
668
|
"""Validate dataset version if provided."""
|
|
671
669
|
if version is not None:
|
|
672
670
|
semver.validate(version)
|
|
673
671
|
|
|
674
|
-
def _validate_update_version(self, update_version:
|
|
672
|
+
def _validate_update_version(self, update_version: str | None) -> None:
|
|
675
673
|
"""Ensure update_version is one of: major, minor, patch."""
|
|
676
674
|
allowed = ["major", "minor", "patch"]
|
|
677
675
|
if update_version not in allowed:
|
|
@@ -693,7 +691,7 @@ class DataChain:
|
|
|
693
691
|
name: str,
|
|
694
692
|
project: Project,
|
|
695
693
|
kwargs: dict,
|
|
696
|
-
) -> tuple[
|
|
694
|
+
) -> tuple[Job | None, str | None, "DataChain | None"]:
|
|
697
695
|
"""Check if checkpoint exists and return cached dataset if possible."""
|
|
698
696
|
from .datasets import read_dataset
|
|
699
697
|
|
|
@@ -727,11 +725,11 @@ class DataChain:
|
|
|
727
725
|
def _handle_delta(
|
|
728
726
|
self,
|
|
729
727
|
name: str,
|
|
730
|
-
version:
|
|
728
|
+
version: str | None,
|
|
731
729
|
project: Project,
|
|
732
730
|
schema: dict,
|
|
733
731
|
kwargs: dict,
|
|
734
|
-
) ->
|
|
732
|
+
) -> "DataChain | None":
|
|
735
733
|
"""Try to save as a delta dataset.
|
|
736
734
|
Returns:
|
|
737
735
|
A DataChain if delta logic could handle it, otherwise None to fall back
|
|
@@ -811,8 +809,8 @@ class DataChain:
|
|
|
811
809
|
|
|
812
810
|
def map(
|
|
813
811
|
self,
|
|
814
|
-
func:
|
|
815
|
-
params:
|
|
812
|
+
func: Callable | None = None,
|
|
813
|
+
params: str | Sequence[str] | None = None,
|
|
816
814
|
output: OutputType = None,
|
|
817
815
|
**signal_map: Any,
|
|
818
816
|
) -> "Self":
|
|
@@ -863,8 +861,8 @@ class DataChain:
|
|
|
863
861
|
|
|
864
862
|
def gen(
|
|
865
863
|
self,
|
|
866
|
-
func:
|
|
867
|
-
params:
|
|
864
|
+
func: Callable | Generator | None = None,
|
|
865
|
+
params: str | Sequence[str] | None = None,
|
|
868
866
|
output: OutputType = None,
|
|
869
867
|
**signal_map,
|
|
870
868
|
) -> "Self":
|
|
@@ -903,9 +901,9 @@ class DataChain:
|
|
|
903
901
|
def agg(
|
|
904
902
|
self,
|
|
905
903
|
/,
|
|
906
|
-
func:
|
|
907
|
-
partition_by:
|
|
908
|
-
params:
|
|
904
|
+
func: Callable | None = None,
|
|
905
|
+
partition_by: PartitionByType | None = None,
|
|
906
|
+
params: str | Sequence[str] | None = None,
|
|
909
907
|
output: OutputType = None,
|
|
910
908
|
**signal_map: Callable,
|
|
911
909
|
) -> "Self":
|
|
@@ -1038,8 +1036,8 @@ class DataChain:
|
|
|
1038
1036
|
|
|
1039
1037
|
def batch_map(
|
|
1040
1038
|
self,
|
|
1041
|
-
func:
|
|
1042
|
-
params:
|
|
1039
|
+
func: Callable | None = None,
|
|
1040
|
+
params: str | Sequence[str] | None = None,
|
|
1043
1041
|
output: OutputType = None,
|
|
1044
1042
|
batch: int = 1000,
|
|
1045
1043
|
**signal_map,
|
|
@@ -1087,8 +1085,8 @@ class DataChain:
|
|
|
1087
1085
|
def _udf_to_obj(
|
|
1088
1086
|
self,
|
|
1089
1087
|
target_class: type[UDFObjT],
|
|
1090
|
-
func:
|
|
1091
|
-
params:
|
|
1088
|
+
func: Callable | UDFObjT | None,
|
|
1089
|
+
params: str | Sequence[str] | None,
|
|
1092
1090
|
output: OutputType,
|
|
1093
1091
|
signal_map: dict[str, Callable],
|
|
1094
1092
|
) -> UDFObjT:
|
|
@@ -1180,7 +1178,7 @@ class DataChain:
|
|
|
1180
1178
|
def group_by( # noqa: C901, PLR0912
|
|
1181
1179
|
self,
|
|
1182
1180
|
*,
|
|
1183
|
-
partition_by:
|
|
1181
|
+
partition_by: str | Func | Sequence[str | Func] | None = None,
|
|
1184
1182
|
**kwargs: Func,
|
|
1185
1183
|
) -> "Self":
|
|
1186
1184
|
"""Group rows by specified set of signals and return new signals
|
|
@@ -1486,7 +1484,7 @@ class DataChain:
|
|
|
1486
1484
|
"""Convert every row to a dictionary."""
|
|
1487
1485
|
|
|
1488
1486
|
def to_dict(cols: list[str], row: tuple[Any, ...]) -> dict[str, Any]:
|
|
1489
|
-
return dict(zip(cols, row))
|
|
1487
|
+
return dict(zip(cols, row, strict=False))
|
|
1490
1488
|
|
|
1491
1489
|
return self.results(row_factory=to_dict)
|
|
1492
1490
|
|
|
@@ -1544,7 +1542,7 @@ class DataChain:
|
|
|
1544
1542
|
@overload
|
|
1545
1543
|
def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...
|
|
1546
1544
|
|
|
1547
|
-
def collect(self, *cols: str) -> Iterator[
|
|
1545
|
+
def collect(self, *cols: str) -> Iterator[DataValue | tuple[DataValue, ...]]: # type: ignore[overload-overlap,misc]
|
|
1548
1546
|
"""
|
|
1549
1547
|
Deprecated. Use `to_iter` method instead.
|
|
1550
1548
|
"""
|
|
@@ -1609,8 +1607,8 @@ class DataChain:
|
|
|
1609
1607
|
def merge(
|
|
1610
1608
|
self,
|
|
1611
1609
|
right_ds: "DataChain",
|
|
1612
|
-
on:
|
|
1613
|
-
right_on:
|
|
1610
|
+
on: MergeColType | Sequence[MergeColType],
|
|
1611
|
+
right_on: MergeColType | Sequence[MergeColType] | None = None,
|
|
1614
1612
|
inner=False,
|
|
1615
1613
|
full=False,
|
|
1616
1614
|
rname="right_",
|
|
@@ -1678,8 +1676,8 @@ class DataChain:
|
|
|
1678
1676
|
|
|
1679
1677
|
def _resolve(
|
|
1680
1678
|
ds: DataChain,
|
|
1681
|
-
col:
|
|
1682
|
-
side:
|
|
1679
|
+
col: str | Function | sqlalchemy.ColumnElement,
|
|
1680
|
+
side: str | None,
|
|
1683
1681
|
):
|
|
1684
1682
|
try:
|
|
1685
1683
|
if isinstance(col, Function):
|
|
@@ -1692,7 +1690,7 @@ class DataChain:
|
|
|
1692
1690
|
ops = [
|
|
1693
1691
|
_resolve(self, left, "left")
|
|
1694
1692
|
== _resolve(right_ds, right, "right" if right_on else None)
|
|
1695
|
-
for left, right in zip(on, right_on or on)
|
|
1693
|
+
for left, right in zip(on, right_on or on, strict=False)
|
|
1696
1694
|
]
|
|
1697
1695
|
|
|
1698
1696
|
if errors:
|
|
@@ -1730,8 +1728,8 @@ class DataChain:
|
|
|
1730
1728
|
def subtract( # type: ignore[override]
|
|
1731
1729
|
self,
|
|
1732
1730
|
other: "DataChain",
|
|
1733
|
-
on:
|
|
1734
|
-
right_on:
|
|
1731
|
+
on: str | Sequence[str] | None = None,
|
|
1732
|
+
right_on: str | Sequence[str] | None = None,
|
|
1735
1733
|
) -> "Self":
|
|
1736
1734
|
"""Remove rows that appear in another chain.
|
|
1737
1735
|
|
|
@@ -1788,6 +1786,7 @@ class DataChain:
|
|
|
1788
1786
|
zip(
|
|
1789
1787
|
self.signals_schema.resolve(*on).db_signals(),
|
|
1790
1788
|
other.signals_schema.resolve(*right_on).db_signals(),
|
|
1789
|
+
strict=False,
|
|
1791
1790
|
) # type: ignore[arg-type]
|
|
1792
1791
|
)
|
|
1793
1792
|
return self._evolve(query=self._query.subtract(other._query, signals)) # type: ignore[arg-type]
|
|
@@ -1795,15 +1794,15 @@ class DataChain:
|
|
|
1795
1794
|
def diff(
|
|
1796
1795
|
self,
|
|
1797
1796
|
other: "DataChain",
|
|
1798
|
-
on:
|
|
1799
|
-
right_on:
|
|
1800
|
-
compare:
|
|
1801
|
-
right_compare:
|
|
1797
|
+
on: str | Sequence[str],
|
|
1798
|
+
right_on: str | Sequence[str] | None = None,
|
|
1799
|
+
compare: str | Sequence[str] | None = None,
|
|
1800
|
+
right_compare: str | Sequence[str] | None = None,
|
|
1802
1801
|
added: bool = True,
|
|
1803
1802
|
deleted: bool = True,
|
|
1804
1803
|
modified: bool = True,
|
|
1805
1804
|
same: bool = False,
|
|
1806
|
-
status_col:
|
|
1805
|
+
status_col: str | None = None,
|
|
1807
1806
|
) -> "DataChain":
|
|
1808
1807
|
"""Calculate differences between two chains.
|
|
1809
1808
|
|
|
@@ -1864,12 +1863,12 @@ class DataChain:
|
|
|
1864
1863
|
self,
|
|
1865
1864
|
other: "DataChain",
|
|
1866
1865
|
on: str = "file",
|
|
1867
|
-
right_on:
|
|
1866
|
+
right_on: str | None = None,
|
|
1868
1867
|
added: bool = True,
|
|
1869
1868
|
modified: bool = True,
|
|
1870
1869
|
deleted: bool = False,
|
|
1871
1870
|
same: bool = False,
|
|
1872
|
-
status_col:
|
|
1871
|
+
status_col: str | None = None,
|
|
1873
1872
|
) -> "DataChain":
|
|
1874
1873
|
"""Calculate differences between two chains containing files.
|
|
1875
1874
|
|
|
@@ -1985,6 +1984,8 @@ class DataChain:
|
|
|
1985
1984
|
headers, max_length = self._effective_signals_schema.get_headers_with_length(
|
|
1986
1985
|
include_hidden=include_hidden
|
|
1987
1986
|
)
|
|
1987
|
+
|
|
1988
|
+
columns: list[str] | pd.MultiIndex
|
|
1988
1989
|
if flatten or max_length < 2:
|
|
1989
1990
|
columns = [".".join(filter(None, header)) for header in headers]
|
|
1990
1991
|
else:
|
|
@@ -2080,7 +2081,7 @@ class DataChain:
|
|
|
2080
2081
|
column: str = "",
|
|
2081
2082
|
model_name: str = "",
|
|
2082
2083
|
source: bool = True,
|
|
2083
|
-
nrows:
|
|
2084
|
+
nrows: int | None = None,
|
|
2084
2085
|
**kwargs: Any,
|
|
2085
2086
|
) -> "Self":
|
|
2086
2087
|
"""Generate chain from list of tabular files.
|
|
@@ -2214,10 +2215,10 @@ class DataChain:
|
|
|
2214
2215
|
|
|
2215
2216
|
def to_parquet(
|
|
2216
2217
|
self,
|
|
2217
|
-
path:
|
|
2218
|
-
partition_cols:
|
|
2218
|
+
path: str | os.PathLike[str] | BinaryIO,
|
|
2219
|
+
partition_cols: Sequence[str] | None = None,
|
|
2219
2220
|
chunk_size: int = DEFAULT_PARQUET_CHUNK_SIZE,
|
|
2220
|
-
fs_kwargs:
|
|
2221
|
+
fs_kwargs: dict[str, Any] | None = None,
|
|
2221
2222
|
**kwargs,
|
|
2222
2223
|
) -> None:
|
|
2223
2224
|
"""Save chain to parquet file with SignalSchema metadata.
|
|
@@ -2274,7 +2275,7 @@ class DataChain:
|
|
|
2274
2275
|
# pyarrow infers the best parquet schema from the python types of
|
|
2275
2276
|
# the input data.
|
|
2276
2277
|
table = pa.Table.from_pydict(
|
|
2277
|
-
dict(zip(column_names, chunk)),
|
|
2278
|
+
dict(zip(column_names, chunk, strict=False)),
|
|
2278
2279
|
schema=parquet_schema,
|
|
2279
2280
|
)
|
|
2280
2281
|
|
|
@@ -2312,9 +2313,9 @@ class DataChain:
|
|
|
2312
2313
|
|
|
2313
2314
|
def to_csv(
|
|
2314
2315
|
self,
|
|
2315
|
-
path:
|
|
2316
|
+
path: str | os.PathLike[str],
|
|
2316
2317
|
delimiter: str = ",",
|
|
2317
|
-
fs_kwargs:
|
|
2318
|
+
fs_kwargs: dict[str, Any] | None = None,
|
|
2318
2319
|
**kwargs,
|
|
2319
2320
|
) -> None:
|
|
2320
2321
|
"""Save chain to a csv (comma-separated values) file.
|
|
@@ -2359,8 +2360,8 @@ class DataChain:
|
|
|
2359
2360
|
|
|
2360
2361
|
def to_json(
|
|
2361
2362
|
self,
|
|
2362
|
-
path:
|
|
2363
|
-
fs_kwargs:
|
|
2363
|
+
path: str | os.PathLike[str],
|
|
2364
|
+
fs_kwargs: dict[str, Any] | None = None,
|
|
2364
2365
|
include_outer_list: bool = True,
|
|
2365
2366
|
) -> None:
|
|
2366
2367
|
"""Save chain to a JSON file.
|
|
@@ -2420,8 +2421,8 @@ class DataChain:
|
|
|
2420
2421
|
|
|
2421
2422
|
def to_jsonl(
|
|
2422
2423
|
self,
|
|
2423
|
-
path:
|
|
2424
|
-
fs_kwargs:
|
|
2424
|
+
path: str | os.PathLike[str],
|
|
2425
|
+
fs_kwargs: dict[str, Any] | None = None,
|
|
2425
2426
|
) -> None:
|
|
2426
2427
|
"""Save chain to a JSON lines file.
|
|
2427
2428
|
|
|
@@ -2440,9 +2441,9 @@ class DataChain:
|
|
|
2440
2441
|
connection: "ConnectionType",
|
|
2441
2442
|
*,
|
|
2442
2443
|
batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
|
|
2443
|
-
on_conflict:
|
|
2444
|
-
conflict_columns:
|
|
2445
|
-
column_mapping:
|
|
2444
|
+
on_conflict: str | None = None,
|
|
2445
|
+
conflict_columns: list[str] | None = None,
|
|
2446
|
+
column_mapping: dict[str, str | None] | None = None,
|
|
2446
2447
|
) -> int:
|
|
2447
2448
|
"""Save chain to a database table using a given database connection.
|
|
2448
2449
|
|
|
@@ -2678,13 +2679,13 @@ class DataChain:
|
|
|
2678
2679
|
|
|
2679
2680
|
def to_storage(
|
|
2680
2681
|
self,
|
|
2681
|
-
output:
|
|
2682
|
+
output: str | os.PathLike[str],
|
|
2682
2683
|
signal: str = "file",
|
|
2683
2684
|
placement: FileExportPlacement = "fullpath",
|
|
2684
2685
|
link_type: Literal["copy", "symlink"] = "copy",
|
|
2685
|
-
num_threads:
|
|
2686
|
-
anon:
|
|
2687
|
-
client_config:
|
|
2686
|
+
num_threads: int | None = EXPORT_FILES_MAX_THREADS,
|
|
2687
|
+
anon: bool | None = None,
|
|
2688
|
+
client_config: dict | None = None,
|
|
2688
2689
|
) -> None:
|
|
2689
2690
|
"""Export files from a specified signal to a directory. Files can be
|
|
2690
2691
|
exported to a local or cloud directory.
|
datachain/lib/dc/datasets.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from collections.abc import Sequence
|
|
2
|
-
from typing import TYPE_CHECKING,
|
|
2
|
+
from typing import TYPE_CHECKING, get_origin, get_type_hints
|
|
3
3
|
|
|
4
4
|
from datachain.error import (
|
|
5
5
|
DatasetNotFoundError,
|
|
@@ -26,20 +26,20 @@ if TYPE_CHECKING:
|
|
|
26
26
|
|
|
27
27
|
def read_dataset(
|
|
28
28
|
name: str,
|
|
29
|
-
namespace:
|
|
30
|
-
project:
|
|
31
|
-
version:
|
|
32
|
-
session:
|
|
33
|
-
settings:
|
|
34
|
-
delta:
|
|
35
|
-
delta_on:
|
|
29
|
+
namespace: str | None = None,
|
|
30
|
+
project: str | None = None,
|
|
31
|
+
version: str | int | None = None,
|
|
32
|
+
session: Session | None = None,
|
|
33
|
+
settings: dict | None = None,
|
|
34
|
+
delta: bool | None = False,
|
|
35
|
+
delta_on: str | Sequence[str] | None = (
|
|
36
36
|
"file.path",
|
|
37
37
|
"file.etag",
|
|
38
38
|
"file.version",
|
|
39
39
|
),
|
|
40
|
-
delta_result_on:
|
|
41
|
-
delta_compare:
|
|
42
|
-
delta_retry:
|
|
40
|
+
delta_result_on: str | Sequence[str] | None = None,
|
|
41
|
+
delta_compare: str | Sequence[str] | None = None,
|
|
42
|
+
delta_retry: bool | str | None = None,
|
|
43
43
|
delta_unsafe: bool = False,
|
|
44
44
|
update: bool = False,
|
|
45
45
|
) -> "DataChain":
|
|
@@ -215,13 +215,13 @@ def read_dataset(
|
|
|
215
215
|
|
|
216
216
|
|
|
217
217
|
def datasets(
|
|
218
|
-
session:
|
|
219
|
-
settings:
|
|
218
|
+
session: Session | None = None,
|
|
219
|
+
settings: dict | None = None,
|
|
220
220
|
in_memory: bool = False,
|
|
221
|
-
column:
|
|
221
|
+
column: str | None = None,
|
|
222
222
|
include_listing: bool = False,
|
|
223
223
|
studio: bool = False,
|
|
224
|
-
attrs:
|
|
224
|
+
attrs: list[str] | None = None,
|
|
225
225
|
) -> "DataChain":
|
|
226
226
|
"""Generate chain with list of registered datasets.
|
|
227
227
|
|
|
@@ -298,12 +298,12 @@ def datasets(
|
|
|
298
298
|
|
|
299
299
|
def delete_dataset(
|
|
300
300
|
name: str,
|
|
301
|
-
namespace:
|
|
302
|
-
project:
|
|
303
|
-
version:
|
|
304
|
-
force:
|
|
305
|
-
studio:
|
|
306
|
-
session:
|
|
301
|
+
namespace: str | None = None,
|
|
302
|
+
project: str | None = None,
|
|
303
|
+
version: str | None = None,
|
|
304
|
+
force: bool | None = False,
|
|
305
|
+
studio: bool | None = False,
|
|
306
|
+
session: Session | None = None,
|
|
307
307
|
in_memory: bool = False,
|
|
308
308
|
) -> None:
|
|
309
309
|
"""Removes specific dataset version or all dataset versions, depending on
|
|
@@ -377,7 +377,7 @@ def delete_dataset(
|
|
|
377
377
|
def move_dataset(
|
|
378
378
|
src: str,
|
|
379
379
|
dest: str,
|
|
380
|
-
session:
|
|
380
|
+
session: Session | None = None,
|
|
381
381
|
in_memory: bool = False,
|
|
382
382
|
) -> None:
|
|
383
383
|
"""Moves an entire dataset between namespaces and projects.
|
datachain/lib/dc/hf.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING, Any
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
|
2
2
|
|
|
3
3
|
from datachain.lib.data_model import dict_to_data_model
|
|
4
4
|
from datachain.query import Session
|
|
@@ -15,10 +15,10 @@ if TYPE_CHECKING:
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def read_hf(
|
|
18
|
-
dataset:
|
|
18
|
+
dataset: "HFDatasetType",
|
|
19
19
|
*args: Any,
|
|
20
|
-
session:
|
|
21
|
-
settings:
|
|
20
|
+
session: Session | None = None,
|
|
21
|
+
settings: dict | None = None,
|
|
22
22
|
column: str = "",
|
|
23
23
|
model_name: str = "",
|
|
24
24
|
limit: int = 0,
|
datachain/lib/dc/json.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import os.path
|
|
3
2
|
import re
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
5
4
|
|
|
6
5
|
import cloudpickle
|
|
7
6
|
|
|
@@ -18,15 +17,15 @@ if TYPE_CHECKING:
|
|
|
18
17
|
|
|
19
18
|
|
|
20
19
|
def read_json(
|
|
21
|
-
path:
|
|
20
|
+
path: str | os.PathLike[str],
|
|
22
21
|
type: FileType = "text",
|
|
23
|
-
spec:
|
|
24
|
-
schema_from:
|
|
25
|
-
jmespath:
|
|
26
|
-
column:
|
|
27
|
-
model_name:
|
|
28
|
-
format:
|
|
29
|
-
nrows:
|
|
22
|
+
spec: DataType | None = None,
|
|
23
|
+
schema_from: str | None = "auto",
|
|
24
|
+
jmespath: str | None = None,
|
|
25
|
+
column: str | None = "",
|
|
26
|
+
model_name: str | None = None,
|
|
27
|
+
format: str | None = "json",
|
|
28
|
+
nrows: int | None = None,
|
|
30
29
|
**kwargs,
|
|
31
30
|
) -> "DataChain":
|
|
32
31
|
"""Get data from JSON. It returns the chain itself.
|
datachain/lib/dc/listings.py
CHANGED
|
@@ -1,7 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
2
|
-
TYPE_CHECKING,
|
|
3
|
-
Optional,
|
|
4
|
-
)
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
5
2
|
|
|
6
3
|
from datachain.lib.listing import LISTING_PREFIX, ls
|
|
7
4
|
from datachain.lib.listing_info import ListingInfo
|
|
@@ -56,7 +53,7 @@ class ReadOnlyQueryStep(QueryStep):
|
|
|
56
53
|
|
|
57
54
|
|
|
58
55
|
def listings(
|
|
59
|
-
session:
|
|
56
|
+
session: Session | None = None,
|
|
60
57
|
in_memory: bool = False,
|
|
61
58
|
column: str = "listing",
|
|
62
59
|
**kwargs,
|
|
@@ -84,10 +81,10 @@ def listings(
|
|
|
84
81
|
|
|
85
82
|
def read_listing_dataset(
|
|
86
83
|
name: str,
|
|
87
|
-
version:
|
|
84
|
+
version: str | None = None,
|
|
88
85
|
path: str = "",
|
|
89
|
-
session:
|
|
90
|
-
settings:
|
|
86
|
+
session: Session | None = None,
|
|
87
|
+
settings: dict | None = None,
|
|
91
88
|
) -> tuple["DataChain", "DatasetVersion"]:
|
|
92
89
|
"""Read a listing dataset and return a DataChain and listing version.
|
|
93
90
|
|