datachain 0.17.1__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +6 -0
- datachain/cli/parser/job.py +7 -0
- datachain/data_storage/warehouse.py +1 -1
- datachain/dataset.py +7 -10
- datachain/delta.py +119 -0
- datachain/diff/__init__.py +10 -4
- datachain/lib/dc/datachain.py +89 -2
- datachain/lib/dc/datasets.py +41 -1
- datachain/lib/dc/storage.py +45 -11
- datachain/lib/signal_schema.py +12 -6
- datachain/query/dataset.py +27 -10
- datachain/remote/studio.py +2 -0
- datachain/studio.py +3 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/METADATA +2 -2
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/RECORD +19 -18
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/WHEEL +1 -1
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED

@@ -779,6 +779,7 @@ class Catalog:
         uuid: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.

@@ -795,6 +796,11 @@ class Catalog:
         try:
            dataset = self.get_dataset(name)
            default_version = dataset.next_version_patch
+           if update_version == "major":
+               default_version = dataset.next_version_major
+           if update_version == "minor":
+               default_version = dataset.next_version_minor
+
            if (description or attrs) and (
                dataset.description != description or dataset.attrs != attrs
            ):
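The new `update_version` argument selects which semver component the next default version bumps. A minimal standalone sketch of that selection (plain Python, not the actual `DatasetRecord` properties):

```python
# Illustrative only: mirrors the next_version_major/minor/patch choice above,
# assuming versions are plain "MAJOR.MINOR.PATCH" strings.
def next_version(latest: str, update_version: str = "patch") -> str:
    major, minor, patch = map(int, latest.split("."))
    if update_version == "major":
        return f"{major + 1}.0.0"
    if update_version == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"

assert next_version("1.2.3") == "1.2.4"           # default: patch
assert next_version("1.2.3", "minor") == "1.3.0"
assert next_version("1.2.3", "major") == "2.0.0"
```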
datachain/cli/parser/job.py
CHANGED

@@ -82,6 +82,13 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
         nargs="+",
         help="Python package requirements",
     )
+    studio_run_parser.add_argument(
+        "--priority",
+        type=int,
+        default=5,
+        help="Priority for the job in range 0-5. "
+        "Lower value is higher priority (default: 5)",
+    )
 
     studio_ls_help = "List jobs in Studio"
     studio_ls_description = "List jobs in Studio."
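A standalone sketch of the new flag's parsing behavior (a plain argparse reproduction, not the real datachain parser):

```python
import argparse

# Reproduces the `--priority` option added to studio_run_parser above.
parser = argparse.ArgumentParser(prog="datachain job run")
parser.add_argument(
    "--priority",
    type=int,
    default=5,
    help="Priority for the job in range 0-5. Lower value is higher priority (default: 5)",
)

assert parser.parse_args([]).priority == 5                    # default
assert parser.parse_args(["--priority", "1"]).priority == 1   # explicit value
```

The value is then threaded through `create_job` and `StudioClient` into the job payload, as shown in the `datachain/studio.py` and `datachain/remote/studio.py` diffs below.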
datachain/data_storage/warehouse.py
CHANGED

@@ -258,7 +258,7 @@ class AbstractWarehouse(ABC, Serializable):
         if Client.is_data_source_uri(dataset_name):
             # for datasets that are created for bucket listing we use different prefix
             prefix = self.DATASET_SOURCE_TABLE_PREFIX
-        return f"{prefix}{dataset_name}_{version}"
+        return f"{prefix}{dataset_name}_{version.replace('.', '_')}"
 
     def temp_table_name(self) -> str:
         return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
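This one-line change matters because dataset versions are now full semver strings, and dots are awkward in SQL table identifiers. A standalone illustration (the prefix value is made up; the real one comes from `DATASET_SOURCE_TABLE_PREFIX`):

```python
prefix = "src_"  # stand-in for DATASET_SOURCE_TABLE_PREFIX; illustrative only
dataset_name, version = "my_dataset", "1.2.3"

# Before the change the name would be "src_my_dataset_1.2.3"; now:
table_name = f"{prefix}{dataset_name}_{version.replace('.', '_')}"
assert table_name == "src_my_dataset_1_2_3"
```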
datachain/dataset.py
CHANGED

@@ -107,24 +107,21 @@ class DatasetDependency:
         dataset_version: Optional[str],
         dataset_version_created_at: Optional[datetime],
     ) -> Optional["DatasetDependency"]:
-        from datachain.client import Client
-        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+        from datachain.lib.listing import is_listing_dataset
 
         if not dataset_id:
             return None
 
         assert dataset_name is not None
-        dependency_type = DatasetDependencyType.DATASET
-        dependency_name = dataset_name
-
-        if is_listing_dataset(dataset_name):
-            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
 
         return cls(
             id,
-            dependency_type,
-            dependency_name,
+            (
+                DatasetDependencyType.STORAGE
+                if is_listing_dataset(dataset_name)
+                else DatasetDependencyType.DATASET
+            ),
+            dataset_name,
             (
                 dataset_version  # type: ignore[arg-type]
                 if dataset_version
datachain/delta.py
ADDED

@@ -0,0 +1,119 @@
+from collections.abc import Sequence
+from copy import copy
+from functools import wraps
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+
+import datachain
+from datachain.dataset import DatasetDependency
+from datachain.error import DatasetNotFoundError
+
+if TYPE_CHECKING:
+    from typing_extensions import Concatenate, ParamSpec
+
+    from datachain.lib.dc import DataChain
+
+    P = ParamSpec("P")
+
+
+T = TypeVar("T", bound="DataChain")
+
+
+def delta_disabled(
+    method: "Callable[Concatenate[T, P], T]",
+) -> "Callable[Concatenate[T, P], T]":
+    """
+    Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to
+    work with delta updates. It throws `NotImplementedError` if chain on which
+    method is called is marked as delta.
+    """
+
+    @wraps(method)
+    def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
+        if self.delta:
+            raise NotImplementedError(
+                f"Delta update cannot be used with {method.__name__}"
+            )
+        return method(self, *args, **kwargs)
+
+    return _inner
+
+
+def _append_steps(dc: "DataChain", other: "DataChain"):
+    """Returns cloned chain with appended steps from other chain.
+    Steps are all those modification methods applied like filters, mappers etc.
+    """
+    dc = dc.clone()
+    dc._query.steps += other._query.steps.copy()
+    dc.signals_schema = other.signals_schema
+    return dc
+
+
+def delta_update(
+    dc: "DataChain",
+    name: str,
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
+    """
+    Creates new chain that consists of the last version of current delta dataset
+    plus diff from the source with all needed modifications.
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
+    but just the diff part, which is very important for performance.
+
+    Note that currently delta update works only if there is only one direct dependency.
+    """
+    catalog = dc.session.catalog
+    dc._query.apply_listing_pre_step()
+
+    try:
+        latest_version = catalog.get_dataset(name).latest_version
+    except DatasetNotFoundError:
+        # first creation of delta update dataset
+        return None, None, True
+
+    dependencies = catalog.get_dataset_dependencies(
+        name, latest_version, indirect=False
+    )
+
+    dep = dependencies[0]
+    if not dep:
+        # starting dataset (e.g listing) was removed so we are backing off to normal
+        # dataset creation, as it was created first time
+        return None, None, True
+
+    source_ds_name = dep.name
+    source_ds_version = dep.version
+    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
+    dependencies = copy(dependencies)
+    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
+    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+    # We append all the steps from the original chain to diff, e.g filters, mappers.
+    diff = _append_steps(diff, dc)
+
+    # to avoid re-calculating diff multiple times
+    diff = diff.persist()
+
+    if diff.empty:
+        return None, None, False
+
+    # merging diff and the latest version of dataset
+    delta_chain = (
+        datachain.read_dataset(name, latest_version)
+        .compare(
+            diff,
+            on=right_on or on,
+            added=True,
+            modified=False,
+            deleted=False,
+        )
+        .union(diff)
+    )
+
+    return delta_chain, dependencies, True  # type: ignore[return-value]
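How the new module is meant to be used end to end, as a hedged sketch (the URI and field names are illustrative; the parameters come from the `read_storage`/`read_dataset` diffs below):

```python
import datachain as dc

# Mark the chain as delta at read time; on each .save() only the diff between
# the current source version and the previously used one is re-processed.
chain = dc.read_storage(
    "s3://my-bucket/images/",  # hypothetical URI
    delta=True,
    delta_on="file.path",      # fields that uniquely identify a source row
).save("images-processed")
```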
datachain/diff/__init__.py
CHANGED

@@ -30,7 +30,7 @@ class CompareStatus(str, Enum):
     SAME = "S"
 
 
-def _compare(  # noqa: C901
+def _compare(  # noqa: C901, PLR0912
     left: "DataChain",
     right: "DataChain",
     on: Union[str, Sequence[str]],

@@ -77,14 +77,16 @@ def _compare(  # noqa: C901
     cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
 
     # getting correct on and right_on column names
+    on_ = on
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-    right_on = right.signals_schema.resolve(*(right_on or on)).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals()  # type: ignore[assignment]
 
     # getting correct compare and right_compare column names if they are defined
     if compare:
+        compare_ = compare
         compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
         right_compare = right.signals_schema.resolve(
-            *(right_compare or compare)
+            *(right_compare or compare_)
         ).db_signals()  # type: ignore[assignment]
     elif not compare and len(cols) != len(right_cols):
         # here we will mark all rows that are not added or deleted as modified since

@@ -155,7 +157,11 @@ def _compare(  # noqa: C901
     if status_col:
         cols_select.append(diff_col)
 
-    dc_diff = dc_diff.select(*cols_select)
+    if not dc_diff._sys:
+        # TODO workaround when sys signal is not available in diff
+        dc_diff = dc_diff.settings(sys=True).select(*cols_select).settings(sys=False)
+    else:
+        dc_diff = dc_diff.select(*cols_select)
 
     # final schema is schema from the left chain with status column added if needed
     dc_diff.signals_schema = (
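The `on_`/`compare_` copies fix a shadowing bug: `on` is overwritten with resolved DB column names before being used as the fallback for `right_on`, so the fallback previously resolved already-resolved names. A minimal illustration of the hazard (plain lists standing in for signal names):

```python
on = ["id"]
on = [f"db_{c}" for c in on]  # the resolve step rewrites `on` in place
right_on = None
fallback = right_on or on      # -> ["db_id"], not the user-facing ["id"]
assert fallback == ["db_id"]
```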
datachain/lib/dc/datachain.py
CHANGED

@@ -25,6 +25,7 @@ from tqdm import tqdm
 
 from datachain import semver
 from datachain.dataset import DatasetRecord
+from datachain.delta import delta_disabled, delta_update
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func

@@ -72,6 +73,9 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
+T = TypeVar("T", bound="DataChain")
+
+
 class DataChain:
     """DataChain - a data structure for batch data processing and evaluation.
 

@@ -164,6 +168,7 @@ class DataChain:
         self.signals_schema = signal_schema
         self._setup: dict = setup or {}
         self._sys = _sys
+        self._delta = False
 
     def __repr__(self) -> str:
         """Return a string representation of the chain."""

@@ -177,6 +182,32 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()
 
+    def _as_delta(
+        self,
+        on: Optional[Union[str, Sequence[str]]] = None,
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+    ) -> "Self":
+        """Marks this chain as delta, which means special delta process will be
+        called on saving dataset for optimization"""
+        if on is None:
+            raise ValueError("'delta on' fields must be defined")
+        self._delta = True
+        self._delta_on = on
+        self._delta_result_on = right_on
+        self._delta_compare = compare
+        return self
+
+    @property
+    def empty(self) -> bool:
+        """Returns True if chain has zero number of rows"""
+        return not bool(self.count())
+
+    @property
+    def delta(self) -> bool:
+        """Returns True if this chain is ran in "delta" update mode"""
+        return self._delta
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""

@@ -254,9 +285,17 @@ class DataChain:
         signal_schema = copy.deepcopy(self.signals_schema)
         if _sys is None:
             _sys = self._sys
-        return type(self)(
+        chain = type(self)(
             query, settings, signal_schema=signal_schema, setup=self._setup, _sys=_sys
         )
+        if self.delta:
+            chain = chain._as_delta(
+                on=self._delta_on,
+                right_on=self._delta_result_on,
+                compare=self._delta_compare,
+            )
+
+        return chain
 
     def settings(
         self,

@@ -461,8 +500,9 @@ class DataChain:
         version: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
         **kwargs,
-    ) -> "Self":
+    ) -> "DataChain":
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:

@@ -472,11 +512,52 @@ class DataChain:
             description : description of a dataset.
             attrs : attributes of a dataset. They can be without value, e.g "NLP",
                 or with a value, e.g "location=US".
+            update_version: which part of the dataset version to automatically increase.
+                Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
         if version is not None:
             semver.validate(version)
 
+        if update_version is not None and update_version not in [
+            "patch",
+            "major",
+            "minor",
+        ]:
+            raise ValueError(
+                "update_version can have one of the following values: major, minor or"
+                " patch"
+            )
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
+        if self.delta and name:
+            delta_ds, dependencies, has_changes = delta_update(
+                self,
+                name,
+                on=self._delta_on,
+                right_on=self._delta_result_on,
+                compare=self._delta_compare,
+            )
+
+            if delta_ds:
+                return self._evolve(
+                    query=delta_ds._query.save(
+                        name=name,
+                        version=version,
+                        feature_schema=schema,
+                        dependencies=dependencies,
+                        **kwargs,
+                    )
+                )
+
+            if not has_changes:
+                # sources have not been changed so new version of resulting dataset
+                # would be the same as previous one. To avoid duplicating exact
+                # datasets, we won't create new version of it and we will return
+                # current latest version instead.
+                from .datasets import read_dataset
+
+                return read_dataset(name, **kwargs)
+
         return self._evolve(
             query=self._query.save(
                 name=name,

@@ -484,6 +565,7 @@ class DataChain:
                 description=description,
                 attrs=attrs,
                 feature_schema=schema,
+                update_version=update_version,
                 **kwargs,
             )
         )

@@ -601,6 +683,7 @@ class DataChain:
             signal_schema=udf_obj.output,
         )
 
+    @delta_disabled
     def agg(
         self,
         func: Optional[Callable] = None,

@@ -754,6 +837,7 @@ class DataChain:
 
         return self._evolve(query=self._query.order_by(*args))
 
+    @delta_disabled
     def distinct(self, arg: str, *args: str) -> "Self":  # type: ignore[override]
         """Removes duplicate rows based on uniqueness of some input column(s)
         i.e if rows are found with the same value of input column(s), only one

@@ -788,6 +872,7 @@ class DataChain:
             query=self._query.select(*columns), signal_schema=new_schema
         )
 
+    @delta_disabled  # type: ignore[arg-type]
     def group_by(
         self,
         *,

@@ -1146,6 +1231,7 @@ class DataChain:
         schema = self.signals_schema.clone_without_file_signals()
         return self.select(*schema.values.keys())
 
+    @delta_disabled
     def merge(
         self,
         right_ds: "DataChain",

@@ -1254,6 +1340,7 @@ class DataChain:
 
         return ds
 
+    @delta_disabled
     def union(self, other: "Self") -> "Self":
         """Return the set union of the two datasets.
 
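Taken together with the catalog change, `.save()` now lets callers pick the bump size. A hedged usage sketch (dataset name and data are invented; `read_values` is an existing datachain constructor):

```python
import datachain as dc

chain = dc.read_values(num=[1, 2, 3])
chain.save("numbers")                           # default bump: patch (e.g. 1.0.0 -> 1.0.1)
chain.save("numbers", update_version="minor")   # e.g. 1.0.1 -> 1.1.0
chain.save("numbers", update_version="major")   # e.g. 1.1.0 -> 2.0.0
# chain.save("numbers", update_version="week")  # would raise ValueError per the check above
```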
datachain/lib/dc/datasets.py
CHANGED

@@ -1,3 +1,4 @@
+from collections.abc import Sequence
 from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
 from datachain.error import DatasetVersionNotFoundError

@@ -27,6 +28,10 @@ def read_dataset(
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
+    delta: Optional[bool] = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.

@@ -38,6 +43,36 @@ def read_dataset(
         settings : Settings to use for the chain.
         fallback_to_studio : Try to pull dataset from Studio if not found locally.
             Default is True.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in delta_on will be used.
 
     Example:
         ```py

@@ -113,7 +148,12 @@ def read_dataset(
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
-    return DataChain(query, _settings, signals_schema)
+    chain = DataChain(query, _settings, signals_schema)
+    if delta:
+        chain = chain._as_delta(
+            on=delta_on, right_on=delta_result_on, compare=delta_compare
+        )
+    return chain
 
 
 def datasets(
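Delta mode also composes with dataset-to-dataset pipelines, and `delta_result_on` covers the renamed-key case the docstring describes. A sketch under the same caveats (all names are hypothetical):

```python
import datachain as dc

# Hypothetical: the saved chain renames "id" to "source_id", so the resulting
# dataset needs delta_result_on to match rows back to the source keys.
derived = dc.read_dataset(
    "source-dataset",
    delta=True,
    delta_on="id",
    delta_result_on="source_id",
).save("derived-dataset")
```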
datachain/lib/dc/storage.py
CHANGED

@@ -1,11 +1,12 @@
 import os.path
+from collections.abc import Sequence
+from functools import reduce
 from typing import (
     TYPE_CHECKING,
     Optional,
     Union,
 )
 
-from datachain.error import DatasetNotFoundError
 from datachain.lib.file import (
     FileType,
     get_file_type,

@@ -33,6 +34,10 @@ def read_storage(
     column: str = "file",
     update: bool = False,
     anon: bool = False,
+    delta: Optional[bool] = False,
+    delta_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_result_on: Optional[Union[str, Sequence[str]]] = None,
+    delta_compare: Optional[Union[str, Sequence[str]]] = None,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.

@@ -48,6 +53,36 @@ def read_storage(
         update : force storage reindexing. Default is False.
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
+        delta: If set to True, we optimize the creation of new dataset versions by
+            calculating the diff between the latest version of this storage and the
+            version used to create the most recent version of the resulting chain
+            dataset (the one specified in `.save()`). We then run the "diff" chain
+            using only the diff data, rather than the entire storage data, and merge
+            that diff chain with the latest version of the resulting dataset to create
+            a new version. This approach avoids applying modifications to all records
+            from storage every time, which can be an expensive operation.
+            The diff is calculated using the `DataChain.compare()` method, which
+            compares the `delta_on` fields to find matches and checks the compare
+            fields to determine if a record has changed. Note that this process only
+            considers added and modified records in storage; deleted records are not
+            removed from the new dataset version.
+            This calculation is based on the difference between the current version
+            of the source and the version used to create the dataset.
+        delta_on: A list of fields that uniquely identify rows in the source.
+            If two rows have the same values, they are considered the same (e.g., they
+            could be different versions of the same row in a versioned source).
+            This is used in the delta update to calculate the diff.
+        delta_result_on: A list of fields in the resulting dataset that correspond
+            to the `delta_on` fields from the source.
+            This is needed to identify rows that have changed in the source but are
+            already present in the current version of the resulting dataset, in order
+            to avoid including outdated versions of those rows in the new dataset.
+            We retain only the latest versions of rows to prevent duplication.
+            There is no need to define this if the `delta_on` fields are present in
+            the final dataset and have not been renamed.
+        delta_compare: A list of fields used to check if the same row has been modified
+            in the new version of the source.
+            If not defined, all fields except those defined in `delta_on` will be used.
 
     Returns:
         DataChain: A DataChain object containing the file information.

@@ -107,7 +142,7 @@ def read_storage(
     if not uris:
         raise ValueError("No URIs provided")
 
-    storage_chain = None
+    chains = []
     listed_ds_name = set()
     file_values = []
 

@@ -132,11 +167,6 @@ def read_storage(
 
     def lst_fn(ds_name, lst_uri):
         # disable prefetch for listing, as it pre-downloads all files
-        try:
-            version = catalog.get_dataset(ds_name).next_version_major
-        except DatasetNotFoundError:
-            version = None
-
         (
             read_records(
                 DataChain.DEFAULT_FILE_RECORD,

@@ -150,18 +180,18 @@ def read_storage(
                 output={f"{column}": file_type},
             )
             # for internal listing datasets, we always bump major version
-            .save(ds_name, listing=True, version=version)
+            .save(ds_name, listing=True, update_version="major")
         )
 
         dc._query.set_listing_fn(
             lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
         )
 
-        chain = ls(dc, list_path, recursive=recursive, column=column)
-
-        storage_chain = storage_chain.union(chain) if storage_chain else chain
+        chains.append(ls(dc, list_path, recursive=recursive, column=column))
         listed_ds_name.add(list_ds_name)
 
+    storage_chain = None if not chains else reduce(lambda x, y: x.union(y), chains)
+
     if file_values:
         file_chain = read_values(
             session=session,

@@ -176,4 +206,8 @@ def read_storage(
 
     assert storage_chain is not None
 
+    if delta:
+        storage_chain = storage_chain._as_delta(
+            on=delta_on, right_on=delta_result_on, compare=delta_compare
+        )
     return storage_chain
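The `reduce` refactor replaces the incremental `storage_chain = storage_chain.union(chain) if storage_chain else chain` pattern with a single fold over the collected chains; behavior is equivalent. A standalone illustration with lists standing in for chains:

```python
from functools import reduce

chains = [[1], [2, 3], [4]]                  # stand-ins for per-URI chains
merged = reduce(lambda x, y: x + y, chains)  # `+` plays the role of .union()
assert merged == [1, 2, 3, 4]

chains = []
storage_chain = None if not chains else reduce(lambda x, y: x + y, chains)
assert storage_chain is None                 # mirrors the new empty-list guard
```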
datachain/lib/signal_schema.py
CHANGED

@@ -461,14 +461,13 @@ class SignalSchema:
             pos += 1
         return objs
 
-    def contains_file(self) -> bool:
-        for type_ in self.values.values():
-            if (fr := ModelStore.to_pydantic(type_)) is not None and issubclass(
+    def get_file_signal(self) -> Optional[str]:
+        for signal_name, signal_type in self.values.items():
+            if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass(
                 fr, File
             ):
-                return True
-
-        return False
+                return signal_name
+        return None
 
     def slice(
         self,

@@ -705,6 +704,13 @@ class SignalSchema:
 
         return SignalSchema(self.values | schema_right)
 
+    def append(self, right: "SignalSchema") -> "SignalSchema":
+        missing_schema = {
+            key: right.values[key]
+            for key in [k for k in right.values if k not in self.values]
+        }
+        return SignalSchema(self.values | missing_schema)
+
     def get_signals(self, target_type: type[DataModel]) -> Iterator[str]:
         for path, type_, has_subtree, _ in self.get_flat_tree():
             if has_subtree and issubclass(type_, target_type):
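The new `append` differs from a plain `|` merge in that keys already present on the left keep their types; only missing keys come from the right. A dict sketch of the semantics (plain dicts standing in for `SignalSchema.values`):

```python
left = {"a": int, "b": str}
right = {"b": float, "c": bytes}

missing = {k: v for k, v in right.items() if k not in left}
merged = left | missing
assert merged == {"a": int, "b": str, "c": bytes}  # left's "b" wins
```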
datachain/query/dataset.py
CHANGED

@@ -41,7 +41,7 @@ from datachain.data_storage.schema import (
     partition_col_names,
     partition_columns,
 )
-from datachain.dataset import DATASET_PREFIX, DatasetStatus, RowDict
+from datachain.dataset import DATASET_PREFIX, DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired

@@ -166,11 +166,13 @@ class Step(ABC):
 
 @frozen
 class QueryStep:
+    """A query that returns all rows from specific dataset version"""
+
     catalog: "Catalog"
     dataset_name: str
     dataset_version: str
 
-    def apply(self):
+    def apply(self) -> "StepResult":
         def q(*columns):
             return sqlalchemy.select(*columns)
 

@@ -1127,9 +1129,14 @@ class DatasetQuery:
         self.version = version
 
         if is_listing_dataset(name):
-            # not setting query step yet as listing dataset might not exist at
-            # this point
-            self.list_ds_name = name
+            if version:
+                # this listing dataset should already be listed as we specify
+                # exact version
+                self._set_starting_step(self.catalog.get_dataset(name))
+            else:
+                # not setting query step yet as listing dataset might not exist at
+                # this point
+                self.list_ds_name = name
         elif fallback_to_studio and is_token_set():
             self._set_starting_step(
                 self.catalog.get_dataset_with_remote_fallback(name, version)

@@ -1205,11 +1212,8 @@ class DatasetQuery:
         """Setting listing function to be run if needed"""
         self.listing_fn = fn
 
-    def apply_steps(self) -> QueryGenerator:
-        """
-        Apply the steps in the query and return the resulting
-        sqlalchemy.SelectBase.
-        """
+    def apply_listing_pre_step(self) -> None:
+        """Runs listing pre-step if needed"""
         if self.list_ds_name and not self.starting_step:
             listing_ds = None
             try:

@@ -1225,6 +1229,13 @@ class DatasetQuery:
             # at this point we know what is our starting listing dataset name
             self._set_starting_step(listing_ds)  # type: ignore [arg-type]
 
+    def apply_steps(self) -> QueryGenerator:
+        """
+        Apply the steps in the query and return the resulting
+        sqlalchemy.SelectBase.
+        """
+        self.apply_listing_pre_step()
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)

@@ -1687,8 +1698,10 @@ class DatasetQuery:
         name: Optional[str] = None,
         version: Optional[str] = None,
         feature_schema: Optional[dict] = None,
+        dependencies: Optional[list[DatasetDependency]] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
+        update_version: Optional[str] = "patch",
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""

@@ -1723,6 +1736,7 @@ class DatasetQuery:
                 columns=columns,
                 description=description,
                 attrs=attrs,
+                update_version=update_version,
                 **kwargs,
             )
             version = version or dataset.latest_version

@@ -1740,6 +1754,9 @@ class DatasetQuery:
             )
             self.catalog.update_dataset_version_with_warehouse_info(dataset, version)
 
+            if dependencies:
+                # overriding dependencies
+                self.dependencies = {(dep.name, dep.version) for dep in dependencies}
             self._add_dependencies(dataset, version)  # type: ignore [arg-type]
         finally:
             self.cleanup()
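One detail worth calling out: when explicit dependencies are passed to `save()` (the delta path does this), each `DatasetDependency` is reduced to a `(name, version)` pair before being recorded. A minimal sketch with a stand-in class:

```python
from dataclasses import dataclass

@dataclass
class Dep:  # stand-in for DatasetDependency; illustrative only
    name: str
    version: str

dependencies = [Dep("source-ds", "2.0.0")]
recorded = {(d.name, d.version) for d in dependencies}
assert recorded == {("source-ds", "2.0.0")}
```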
datachain/remote/studio.py
CHANGED

@@ -388,6 +388,7 @@ class StudioClient:
         python_version: Optional[str] = None,
         requirements: Optional[str] = None,
         repository: Optional[str] = None,
+        priority: Optional[int] = None,
     ) -> Response[JobData]:
         data = {
             "query": query,

@@ -399,6 +400,7 @@ class StudioClient:
             "python_version": python_version,
             "requirements": requirements,
             "repository": repository,
+            "priority": priority,
         }
         return self._send_request("datachain/job", data)
datachain/studio.py
CHANGED

@@ -40,6 +40,7 @@ def process_jobs_args(args: "Namespace"):
         args.repository,
         args.req,
         args.req_file,
+        args.priority,
     )
 
     if args.cmd == "cancel":

@@ -266,6 +267,7 @@ def create_job(
     repository: Optional[str] = None,
     req: Optional[list[str]] = None,
     req_file: Optional[str] = None,
+    priority: Optional[int] = None,
 ):
     query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
     with open(query_file) as f:

@@ -294,6 +296,7 @@ def create_job(
         python_version=python_version,
         repository=repository,
         requirements=requirements,
+        priority=priority,
     )
     if not response.ok:
         raise DataChainError(response.message)
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.17.1
+Version: 0.18.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -44,7 +44,7 @@ Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
 Requires-Dist: msgpack<2,>=1.0.4
 Requires-Dist: psutil
-Requires-Dist: huggingface_hub
+Requires-Dist: huggingface_hub
 Requires-Dist: iterative-telemetry>=0.0.10
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/RECORD
CHANGED

@@ -3,7 +3,8 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=F0Yk9SmyAf0RNSAEWGjOyZ4nxgMNi538FaQaLPe7bJk,20531
+datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=JtExYIfKMFhEIIcSSWBmaxWpoS3ben7kb692cHHm4Lo,7079

@@ -14,11 +15,11 @@ datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=t_3Y5OGLEthrstBwuwrf5pXVquEuRFu3ZoGe3ajfJB8,1715
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=aB8IGLuvWjZVROOmOKksA0gKiLQyur9Z4GCRdjgfdXo,58725
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
 datachain/cli/__init__.py,sha256=eKCyqT05OMESHXCC93iQdqzusBdk1ptqZbBeaEghkgo,8344

@@ -32,7 +33,7 @@ datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibV
 datachain/cli/commands/query.py,sha256=Xzfgh14nPVH-sclqX1tpZqgfdTugw5s_44v0D33z6FA,1505
 datachain/cli/commands/show.py,sha256=Cf8wBs12h-xtdOzjU5GTDy2C8rF5HJSF0hDJYER1zH8,1606
 datachain/cli/parser/__init__.py,sha256=sjCIcosAtZqa0m50GMQHqmCkZSYxKyZNwQ29XwRQlP0,15913
-datachain/cli/parser/job.py,sha256=
+datachain/cli/parser/job.py,sha256=acdVYuTsqluRDI_FYhZ1ohjQcVtBj-taUm8y9tGb0_0,4580
 datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
 datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49

@@ -50,8 +51,8 @@ datachain/data_storage/metastore.py,sha256=vo2ab-U_-BKfeFYTmvpbCoMyMZEVxrVqM9Djj
 datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=bwZAB_NUMT2WMv5tPQnnLFA0P-PiQtxzSaQ1q6xDxOU,24590
-datachain/data_storage/warehouse.py,sha256=
-datachain/diff/__init__.py,sha256=
+datachain/data_storage/warehouse.py,sha256=RkdX1cunfmpDkRYRdOGNy0kLw7RekIokVl3Dd0i-hrA,31534
+datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825

@@ -80,7 +81,7 @@ datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=elrmJ4YUDC2LZ9yXM1KwImVBOYIBJf6k0ZR7eSe6Aao,7712
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=Zhg8qThFDf9eoNWFH6KGeYB-sIGys7A_ybq2CUBG7Dg,36127
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=FWqA476ygdk4MU-0qehYKxvnt8Tekh21Cyf3RgddD1k,16674

@@ -98,15 +99,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=DIlbIyO3aUidDTo3S2pOrSDyrVhr49iLCFGgL_otSig,80558
+datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=J1I69J2gFIBjRTGr2LG-5qn_rTVzRLcr2y3tVDrmHdg,3068
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/storage.py,sha256=YUlw3OtdRmYc2k24AmqjnqJK8k1H-onjh-mCxu_3BbE,8195
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189

@@ -120,7 +121,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
 datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=ac4mameklmZ-mnL3ZSzv5n8teaPnoXT8aWCdGlgkZE8,59904
 datachain/query/dispatch.py,sha256=15M3zlTUFKM6D2ijITX4o5QxCkRe2klkODsIDi3aQOg,15544
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863

@@ -130,7 +131,7 @@ datachain/query/session.py,sha256=3nyOvPmLiA86IdHc3BL6Dt_himtHVvaDz_I1h3hZ_gI,65
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=pkgrhG7Bc5Z8RykgTg0S_XXiI8CpRnEbyXrOb5osgAM,13598
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045

@@ -152,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.17.1.dist-info/licenses/LICENSE,sha256=
-datachain-0.17.1.dist-info/METADATA,sha256=
-datachain-0.17.1.dist-info/WHEEL,sha256=
-datachain-0.17.1.dist-info/entry_points.txt,sha256=
-datachain-0.17.1.dist-info/top_level.txt,sha256=
-datachain-0.17.1.dist-info/RECORD,,
+datachain-0.18.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.18.0.dist-info/METADATA,sha256=seFHQYDt0EnbQiTRz-SixSCKTMFmF9p94Bd0E4lAyvY,11331
+datachain-0.18.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
+datachain-0.18.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.18.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.18.0.dist-info/RECORD,,
{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/entry_points.txt
File without changes

{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/licenses/LICENSE
File without changes

{datachain-0.17.1.dist-info → datachain-0.18.0.dist-info}/top_level.txt
File without changes