datachain 0.6.7__py3-none-any.whl → 0.6.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain was flagged as possibly problematic by the registry.
- datachain/__init__.py +2 -1
- datachain/catalog/catalog.py +5 -0
- datachain/cli.py +137 -23
- datachain/client/fsspec.py +1 -1
- datachain/data_storage/metastore.py +4 -0
- datachain/dataset.py +5 -0
- datachain/lib/dataset_info.py +3 -0
- datachain/lib/dc.py +26 -6
- datachain/lib/file.py +0 -3
- datachain/lib/meta_formats.py +1 -0
- datachain/lib/models/__init__.py +5 -0
- datachain/lib/models/bbox.py +45 -0
- datachain/lib/models/pose.py +37 -0
- datachain/lib/models/yolo.py +39 -0
- datachain/lib/signal_schema.py +1 -1
- datachain/remote/studio.py +12 -2
- datachain/studio.py +18 -6
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/METADATA +43 -21
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/RECORD +23 -19
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/WHEEL +1 -1
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/LICENSE +0 -0
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/top_level.txt +0 -0

datachain/__init__.py CHANGED

@@ -1,4 +1,4 @@
-from datachain.lib import func
+from datachain.lib import func, models
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
 from datachain.lib.dc import C, Column, DataChain, Sys
 from datachain.lib.file import (
@@ -38,5 +38,6 @@ __all__ = [
     "func",
     "is_chain_type",
     "metrics",
+    "models",
     "param",
 ]
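
The new `models` namespace is exported at the package root. A minimal import sketch (hedged: the re-exports of the new `datachain/lib/models/__init__.py` are not captured in this diff, so the submodule paths from the added files below are used directly):

    from datachain.lib.models.bbox import BBox
    from datachain.lib.models.pose import Pose, Pose3D

    # BBox stores two corners; width and height are derived.
    box = BBox(title="person", x1=0.0, y1=0.0, x2=64.0, y2=128.0)
    print(box.x2 - box.x1, box.y2 - box.y1)  # 64.0 128.0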

datachain/catalog/catalog.py CHANGED

@@ -769,6 +769,7 @@ class Catalog:
         create_rows: Optional[bool] = True,
         validate_version: Optional[bool] = True,
         listing: Optional[bool] = False,
+        uuid: Optional[str] = None,
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.
@@ -816,6 +817,7 @@ class Catalog:
             query_script=query_script,
             create_rows_table=create_rows,
             columns=columns,
+            uuid=uuid,
         )

     def create_new_dataset_version(
@@ -832,6 +834,7 @@ class Catalog:
         script_output="",
         create_rows_table=True,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -855,6 +858,7 @@ class Catalog:
             schema=schema,
             job_id=job_id,
             ignore_if_exists=True,
+            uuid=uuid,
         )

         if create_rows_table:
@@ -1400,6 +1404,7 @@ class Catalog:
             columns=columns,
             feature_schema=remote_dataset_version.feature_schema,
             validate_version=False,
+            uuid=remote_dataset_version.uuid,
         )

         # asking remote to export dataset rows table to s3 and to return signed

datachain/cli.py CHANGED

@@ -4,18 +4,21 @@ import shlex
 import sys
 import traceback
 from argparse import Action, ArgumentParser, ArgumentTypeError, Namespace
-from collections.abc import Iterable, Iterator,
+from collections.abc import Iterable, Iterator, Sequence
 from importlib.metadata import PackageNotFoundError, version
 from itertools import chain
 from multiprocessing import freeze_support
 from typing import TYPE_CHECKING, Optional, Union

 import shtab
+from tabulate import tabulate

 from datachain import Session, utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.config import Config
+from datachain.error import DataChainError
 from datachain.lib.dc import DataChain
-from datachain.studio import process_studio_cli_args
+from datachain.studio import list_datasets, process_studio_cli_args
 from datachain.telemetry import telemetry

 if TYPE_CHECKING:
@@ -416,7 +419,36 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Dataset labels",
     )

-    subp.add_parser(
+    datasets_parser = subp.add_parser(
+        "datasets", parents=[parent_parser], description="List datasets"
+    )
+    datasets_parser.add_argument(
+        "--studio",
+        action="store_true",
+        default=False,
+        help="List the files in the Studio",
+    )
+    datasets_parser.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="List local files only",
+    )
+    datasets_parser.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="List all files including hidden files",
+    )
+    datasets_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
+    )
+
     rm_dataset_parser = subp.add_parser(
         "rm-dataset", parents=[parent_parser], description="Removes dataset"
     )
@@ -474,10 +506,30 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="List files in the long format",
     )
     parse_ls.add_argument(
-        "--
+        "--studio",
+        action="store_true",
+        default=False,
+        help="List the files in the Studio",
+    )
+    parse_ls.add_argument(
+        "-L",
+        "--local",
+        action="store_true",
+        default=False,
+        help="List local files only",
+    )
+    parse_ls.add_argument(
+        "-a",
+        "--all",
+        action="store_true",
+        default=True,
+        help="List all files including hidden files",
+    )
+    parse_ls.add_argument(
+        "--team",
         action="store",
-        default=
-        help="
+        default=None,
+        help="The team to list datasets for. By default, it will use team from config.",
     )

     parse_du = subp.add_parser(
@@ -758,11 +810,12 @@ def format_ls_entry(entry: str) -> str:
 def ls_remote(
     paths: Iterable[str],
     long: bool = False,
+    team: Optional[str] = None,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient

-    client = StudioClient()
+    client = StudioClient(team=team)
     first = True
     for path, response in client.ls(paths):
         if not first:
@@ -789,28 +842,66 @@ def ls_remote(
 def ls(
     sources,
     long: bool = False,
-
-
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
     **kwargs,
 ):
-
-
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)

-
-    remote_type = config["type"]
-    if remote_type == "local":
+    if all or local:
         ls_local(sources, long=long, **kwargs)
-
-
-
-
+
+    if (all or studio) and token:
+        ls_remote(sources, long=long, team=team)
+
+
+def datasets(
+    catalog: "Catalog",
+    studio: bool = False,
+    local: bool = False,
+    all: bool = True,
+    team: Optional[str] = None,
+):
+    token = Config().read().get("studio", {}).get("token")
+    all, local, studio = _determine_flavors(studio, local, all, token)
+
+    local_datasets = set(list_datasets_local(catalog)) if all or local else set()
+    studio_datasets = (
+        set(list_datasets(team=team)) if (all or studio) and token else set()
+    )
+
+    rows = [
+        _datasets_tabulate_row(
+            name=name,
+            version=version,
+            both=(all or (local and studio)) and token,
+            local=(name, version) in local_datasets,
+            studio=(name, version) in studio_datasets,
         )
+        for name, version in local_datasets.union(studio_datasets)
+    ]
+
+    print(tabulate(rows, headers="keys"))


-def
+def list_datasets_local(catalog: "Catalog"):
     for d in catalog.ls_datasets():
         for v in d.versions:
-
+            yield (d.name, v.version)
+
+
+def _datasets_tabulate_row(name, version, both, local, studio):
+    row = {
+        "Name": name,
+        "Version": version,
+    }
+    if both:
+        row["Studio"] = "\u2714" if studio else "\u2716"
+        row["Local"] = "\u2714" if local else "\u2716"
+    return row


 def rm_dataset(
@@ -953,6 +1044,20 @@ def completion(shell: str) -> str:
     )


+def _determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
+    if studio and not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    if local or studio:
+        all = False
+
+    all = all and not (local or studio)
+
+    return all, local, studio
+
+
 def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
     # Required for Windows multiprocessing support
     freeze_support()
@@ -1032,12 +1137,21 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         ls(
             args.sources,
             long=bool(args.long),
-
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
             update=bool(args.update),
             client_config=client_config,
         )
-    elif args.command == "
-
+    elif args.command == "datasets":
+        datasets(
+            catalog=catalog,
+            studio=args.studio,
+            local=args.local,
+            all=args.all,
+            team=args.team,
+        )
     elif args.command == "show":
         show(
             catalog,
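
The flag resolution in `_determine_flavors` is easiest to see as a truth table. A standalone sketch of the same logic (not the package's code; `DataChainError` is replaced with a plain exception, and `all_` avoids shadowing the builtin):

    def determine_flavors(studio, local, all_, token):
        # Mirrors _determine_flavors above: --studio requires a token,
        # and an explicit --local or --studio turns off the "all" default.
        if studio and not token:
            raise RuntimeError("Not logged in to Studio.")
        if local or studio:
            all_ = False
        return all_, local, studio

    print(determine_flavors(False, False, True, token=None))  # (True, False, False)
    print(determine_flavors(False, True, True, token=None))   # (False, True, False)
    print(determine_flavors(True, False, True, token="tok"))  # (False, False, True)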

datachain/client/fsspec.py CHANGED

@@ -358,7 +358,7 @@ class Client(ABC):
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
         if use_cache and (cache_path := self.cache.get_path(file)):
-            return open(cache_path, mode="rb")
+            return open(cache_path, mode="rb")
         assert not file.location
         return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]

datachain/data_storage/metastore.py CHANGED

@@ -138,6 +138,7 @@ class AbstractMetastore(ABC, Serializable):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
     ) -> DatasetRecord:
         """Creates new dataset version."""

@@ -352,6 +353,7 @@ class AbstractDBMetastore(AbstractMetastore):
         """Datasets versions table columns."""
         return [
             Column("id", Integer, primary_key=True),
+            Column("uuid", Text, nullable=False, default=uuid4()),
             Column(
                 "dataset_id",
                 Integer,
@@ -545,6 +547,7 @@ class AbstractDBMetastore(AbstractMetastore):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
+        uuid: Optional[str] = None,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -555,6 +558,7 @@ class AbstractDBMetastore(AbstractMetastore):

         query = self._datasets_versions_insert().values(
             dataset_id=dataset.id,
+            uuid=uuid or str(uuid4()),
             version=version,
             status=status,
             feature_schema=json.dumps(feature_schema or {}),
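
One thing worth flagging in the column definition above: `default=uuid4()` calls `uuid4` once, when the table schema is built, so the column-level default is a single fixed value. The insert path compensates by always passing `uuid or str(uuid4())` explicitly. The per-row idiom in SQLAlchemy passes a callable instead; a sketch, not the package's code:

    from uuid import uuid4

    from sqlalchemy import Column, Text

    # Evaluated once at definition time: every defaulted row shares one value.
    shared_default = Column("uuid", Text, nullable=False, default=uuid4())

    # Evaluated per insert: each defaulted row gets a fresh value.
    per_row_default = Column("uuid", Text, nullable=False,
                             default=lambda: str(uuid4()))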

datachain/dataset.py CHANGED

@@ -163,6 +163,7 @@ class DatasetStatus:
 @dataclass
 class DatasetVersion:
     id: int
+    uuid: str
     dataset_id: int
     version: int
     status: int
@@ -184,6 +185,7 @@ class DatasetVersion:
     def parse(  # noqa: PLR0913
         cls: type[V],
         id: int,
+        uuid: str,
         dataset_id: int,
         version: int,
         status: int,
@@ -203,6 +205,7 @@ class DatasetVersion:
     ):
         return cls(
             id,
+            uuid,
             dataset_id,
             version,
             status,
@@ -306,6 +309,7 @@ class DatasetRecord:
         query_script: str,
         schema: str,
         version_id: int,
+        version_uuid: str,
         version_dataset_id: int,
         version: int,
         version_status: int,
@@ -331,6 +335,7 @@ class DatasetRecord:

         dataset_version = DatasetVersion.parse(
             version_id,
+            version_uuid,
             version_dataset_id,
             version,
             version_status,

datachain/lib/dataset_info.py CHANGED

@@ -1,6 +1,7 @@
 import json
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Optional, Union
+from uuid import uuid4

 from pydantic import Field, field_validator

@@ -15,6 +16,7 @@ if TYPE_CHECKING:

 class DatasetInfo(DataModel):
     name: str
+    uuid: str = Field(default=str(uuid4()))
     version: int = Field(default=1)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
@@ -60,6 +62,7 @@ class DatasetInfo(DataModel):
         job: Optional[Job],
     ) -> "Self":
         return cls(
+            uuid=version.uuid,
             name=dataset.name,
             version=version.version,
             status=version.status,
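
The same evaluate-once pattern appears in `uuid: str = Field(default=str(uuid4()))` above: the default is computed when the class body runs, so every `DatasetInfo` built without an explicit uuid shares it. That is mostly harmless here, since `from_dataset_version` always passes `uuid=version.uuid`, but the per-instance Pydantic idiom is `default_factory`; a sketch:

    from uuid import uuid4

    from pydantic import BaseModel, Field

    class Example(BaseModel):
        # default_factory runs once per instance, giving each object a new uuid.
        uuid: str = Field(default_factory=lambda: str(uuid4()))

    assert Example().uuid != Example().uuid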

datachain/lib/dc.py CHANGED

@@ -30,7 +30,7 @@ from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
+from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ArrowRow, File, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
@@ -895,7 +895,7 @@ class DataChain:
         2. Group-based UDF function input: Instead of individual rows, the function
            receives a list all rows within each group defined by `partition_by`.

-
+        Examples:
            ```py
            chain = chain.agg(
                total=lambda category, amount: [sum(amount)],
@@ -904,6 +904,26 @@ class DataChain:
            )
            chain.save("new_dataset")
            ```
+
+           An alternative syntax, when you need to specify a more complex function:
+
+           ```py
+           # It automatically resolves which columns to pass to the function
+           # by looking at the function signature.
+           def agg_sum(
+               file: list[File], amount: list[float]
+           ) -> Iterator[tuple[File, float]]:
+               yield file[0], sum(amount)
+
+           chain = chain.agg(
+               agg_sum,
+               output={"file": File, "total": float},
+               # Alternative syntax is to use `C` (short for Column) to specify
+               # a column name or a nested column, e.g. C("file.path").
+               partition_by=C("category"),
+           )
+           chain.save("new_dataset")
+           ```
         """
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
@@ -1242,15 +1262,15 @@ class DataChain:
         return self.results(row_factory=to_dict)

     @overload
-    def collect(self) -> Iterator[tuple[
+    def collect(self) -> Iterator[tuple[DataValue, ...]]: ...

     @overload
-    def collect(self, col: str) -> Iterator[
+    def collect(self, col: str) -> Iterator[DataValue]: ...

     @overload
-    def collect(self, *cols: str) -> Iterator[tuple[
+    def collect(self, *cols: str) -> Iterator[tuple[DataValue, ...]]: ...

-    def collect(self, *cols: str) -> Iterator[Union[
+    def collect(self, *cols: str) -> Iterator[Union[DataValue, tuple[DataValue, ...]]]:  # type: ignore[overload-overlap,misc]
         """Yields rows of values, optionally limited to the specified columns.

         Args:
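
The new overloads encode the call shapes of `collect`: no arguments yields full rows as tuples, one column yields bare values, and several columns yield tuples again. A hedged usage sketch (`from_values` as used in this release's README):

    from datachain import DataChain

    chain = DataChain.from_values(num=[1, 2, 3], label=["a", "b", "c"])

    list(chain.collect("num"))           # bare DataValue per row: [1, 2, 3]
    list(chain.collect("num", "label"))  # tuples: [(1, "a"), (2, "b"), (3, "c")]
    list(chain.collect())                # full rows as tuples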

datachain/lib/file.py CHANGED

@@ -20,9 +20,6 @@ from PIL import Image
 from pyarrow.dataset import dataset
 from pydantic import Field, field_validator

-if TYPE_CHECKING:
-    from typing_extensions import Self
-
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError

datachain/lib/meta_formats.py CHANGED

@@ -114,6 +114,7 @@ def read_meta(  # noqa: C901
         )
     )
     (model_output,) = chain.collect("meta_schema")
+    assert isinstance(model_output, str)
     if print_schema:
         print(f"{model_output}")
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
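
This assert follows from the `collect` typing change above: the value now arrives as `DataValue`, a broad union, and the assert narrows it to `str` for the type checker before the string is used. The pattern in isolation, with a simplified stand-in for the real `DataValue` union:

    from typing import Union

    DataValue = Union[str, int, float, bool, None]  # simplified stand-in

    def schema_text(value: DataValue) -> str:
        assert isinstance(value, str)  # narrows the union, as in read_meta
        return value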

datachain/lib/models/bbox.py ADDED

@@ -0,0 +1,45 @@
+from typing import Optional
+
+from pydantic import Field
+
+from datachain.lib.data_model import DataModel
+
+
+class BBox(DataModel):
+    """
+    A data model for representing bounding boxes.
+
+    Attributes:
+        title (str): The title of the bounding box.
+        x1 (float): The x-coordinate of the top-left corner of the bounding box.
+        y1 (float): The y-coordinate of the top-left corner of the bounding box.
+        x2 (float): The x-coordinate of the bottom-right corner of the bounding box.
+        y2 (float): The y-coordinate of the bottom-right corner of the bounding box.
+
+    The bounding box is defined by two points:
+        - (x1, y1): The top-left corner of the box.
+        - (x2, y2): The bottom-right corner of the box.
+    """
+
+    title: str = Field(default="")
+    x1: float = Field(default=0)
+    y1: float = Field(default=0)
+    x2: float = Field(default=0)
+    y2: float = Field(default=0)
+
+    @staticmethod
+    def from_xywh(bbox: list[float], title: Optional[str] = None) -> "BBox":
+        """
+        Converts a bounding box in (x, y, width, height) format
+        to a BBox data model instance.
+
+        Args:
+            bbox (list[float]): A bounding box, represented as a list
+                of four floats [x, y, width, height].
+
+        Returns:
+            BBox: An instance of the BBox data model.
+        """
+        assert len(bbox) == 4, f"Bounding box must have 4 elements, got {len(bbox)}"
+        x, y, w, h = bbox
+        return BBox(title=title or "", x1=x, y1=y, x2=x + w, y2=y + h)
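
A quick check of the conversion, using COCO-style `[x, y, width, height]` input:

    from datachain.lib.models.bbox import BBox

    box = BBox.from_xywh([10.0, 20.0, 30.0, 40.0], title="dog")
    # The corners come out as (x, y) and (x + w, y + h).
    assert (box.x1, box.y1, box.x2, box.y2) == (10.0, 20.0, 40.0, 60.0)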

datachain/lib/models/pose.py ADDED

@@ -0,0 +1,37 @@
+from pydantic import Field
+
+from datachain.lib.data_model import DataModel
+
+
+class Pose(DataModel):
+    """
+    A data model for representing pose keypoints.
+
+    Attributes:
+        x (list[float]): The x-coordinates of the keypoints.
+        y (list[float]): The y-coordinates of the keypoints.
+
+    The keypoints are represented as lists of x and y coordinates, where each index
+    corresponds to a specific body part.
+    """
+
+    x: list[float] = Field(default=None)
+    y: list[float] = Field(default=None)
+
+
+class Pose3D(DataModel):
+    """
+    A data model for representing 3D pose keypoints.
+
+    Attributes:
+        x (list[float]): The x-coordinates of the keypoints.
+        y (list[float]): The y-coordinates of the keypoints.
+        visible (list[float]): The visibility of the keypoints.
+
+    The keypoints are represented as lists of x, y, and visibility values,
+    where each index corresponds to a specific body part.
+    """
+
+    x: list[float] = Field(default=None)
+    y: list[float] = Field(default=None)
+    visible: list[float] = Field(default=None)

datachain/lib/models/yolo.py ADDED

@@ -0,0 +1,39 @@
+"""
+This module contains the YOLO models.
+
+YOLO stands for "You Only Look Once", a family of object detection models that
+are designed to be fast and accurate. The models are trained to detect objects
+in images by dividing the image into a grid and predicting the bounding boxes
+and class probabilities for each grid cell.
+
+More information about YOLO can be found here:
+- https://pjreddie.com/darknet/yolo/
+- https://docs.ultralytics.com/
+"""
+
+
+class PoseBodyPart:
+    """
+    An enumeration of body parts for YOLO pose keypoints.
+
+    More information about the body parts can be found here:
+    https://docs.ultralytics.com/tasks/pose/
+    """
+
+    nose = 0
+    left_eye = 1
+    right_eye = 2
+    left_ear = 3
+    right_ear = 4
+    left_shoulder = 5
+    right_shoulder = 6
+    left_elbow = 7
+    right_elbow = 8
+    left_wrist = 9
+    right_wrist = 10
+    left_hip = 11
+    right_hip = 12
+    left_knee = 13
+    right_knee = 14
+    left_ankle = 15
+    right_ankle = 16
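
The attributes are plain integer indexes into the keypoint lists of `Pose`/`Pose3D`, so a keypoint is looked up like this:

    from datachain.lib.models.pose import Pose
    from datachain.lib.models.yolo import PoseBodyPart

    # 17 keypoints, one slot per body part in the enumeration.
    pose = Pose(x=[float(i) for i in range(17)], y=[0.0] * 17)
    nose_x = pose.x[PoseBodyPart.nose]         # 0.0
    wrist_x = pose.x[PoseBodyPart.left_wrist]  # 9.0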

datachain/lib/signal_schema.py CHANGED

datachain/remote/studio.py CHANGED

@@ -131,6 +131,12 @@ class StudioClient:
             timeout=self.timeout,
         )
         ok = response.ok
+        if not ok:
+            if response.status_code == 403:
+                message = f"Not authorized for the team {self.team}"
+                raise DataChainError(message)
+            logger.error("Got bad response from Studio")
+
         content = msgpack.unpackb(response.content, ext_hook=self._unpacker_hook)
         response_data = content.get("data")
         if ok and response_data is None:
@@ -177,8 +183,12 @@ class StudioClient:
                 response.content.decode("utf-8"),
             )
             if response.status_code == 403:
-                message = "Not authorized"
+                message = f"Not authorized for the team {self.team}"
             else:
+                logger.error(
+                    "Got bad response from Studio, content is %s",
+                    response.content.decode("utf-8"),
+                )
                 message = data.get("message", "")
         else:
             message = ""
@@ -214,7 +224,7 @@ class StudioClient:
         # to handle cases where a path will be expanded (i.e. globs)
         response: Response[LsData]
         for path in paths:
-            response = self._send_request_msgpack("ls", {"source": path})
+            response = self._send_request_msgpack("datachain/ls", {"source": path})
             yield path, response

     def ls_datasets(self) -> Response[LsData]:

datachain/studio.py CHANGED

@@ -1,8 +1,11 @@
 import os
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
+
+from tabulate import tabulate

 from datachain.catalog.catalog import raise_remote_error
 from datachain.config import Config, ConfigLevel
+from datachain.dataset import QUERY_DATASET_PREFIX
 from datachain.error import DataChainError
 from datachain.remote.studio import StudioClient
 from datachain.utils import STUDIO_URL
@@ -24,7 +27,13 @@ def process_studio_cli_args(args: "Namespace"):
     if args.cmd == "token":
         return token()
     if args.cmd == "datasets":
-
+        rows = [
+            {"Name": name, "Version": version}
+            for name, version in list_datasets(args.team)
+        ]
+        print(tabulate(rows, headers="keys"))
+        return 0
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -103,19 +112,22 @@ def token():
     print(token)


-def list_datasets(
-    client = StudioClient(team=
+def list_datasets(team: Optional[str] = None):
+    client = StudioClient(team=team)
     response = client.ls_datasets()
     if not response.ok:
         raise_remote_error(response.message)
     if not response.data:
-        print("No datasets found.")
         return
+
     for d in response.data:
         name = d.get("name")
+        if name and name.startswith(QUERY_DATASET_PREFIX):
+            continue
+
         for v in d.get("versions", []):
             version = v.get("version")
-
+            yield (name, version)


 def save_config(hostname, token):
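
Note that `list_datasets` is now a generator (it yields instead of printing), so nothing executes until the caller iterates: the Studio request and any `raise_remote_error` fire on the first value pulled, not at call time. A sketch of the consequence (`"my-team"` is a placeholder):

    from datachain.studio import list_datasets

    gen = list_datasets(team="my-team")  # no request has been sent yet
    rows = list(gen)  # the request, error handling, and filtering happen here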

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.7
+Version: 0.6.9
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -45,6 +45,7 @@ Requires-Dist: huggingface-hub
 Requires-Dist: iterative-telemetry >=0.0.9
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client <1,>=0.21
+Requires-Dist: tabulate
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
 Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
@@ -52,6 +53,7 @@ Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
 Requires-Dist: types-requests ; extra == 'dev'
+Requires-Dist: types-tabulate ; extra == 'dev'
 Provides-Extra: docs
 Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
 Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
@@ -118,33 +120,41 @@ Requires-Dist: usearch ; extra == 'vector'
    :target: https://github.com/iterative/datachain/actions/workflows/tests.yml
    :alt: Tests

-DataChain is a
-
-
+DataChain is a Python-based AI-data warehouse for transforming and analyzing unstructured
+data like images, audio, videos, text and PDFs. It integrates with external storage
+(e.g., S3) to process data efficiently without data duplication and manages metadata
+in an internal database for easy and efficient querying.
+
+
+Use Cases
+=========
+
+1. **Multimodal Dataset Preparation and Curation**: ideal for organizing and
+   refining data in pre-training, fine-tuning, or LLM evaluation stages.
+2. **GenAI Data Analytics**: enables advanced analytics for multimodal data and
+   ad-hoc analytics using LLMs.

 Key Features
 ============

-📂 **
--
-  file systems.
-- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
+📂 **Multimodal Dataset Versioning.**
+- Version unstructured data without redundant data copies, by supporting
+  references to S3, GCP, Azure, and local file systems.
+- Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet, etc.
 - Unite files and metadata together into persistent, versioned, columnar datasets.

-🐍 **Python-friendly
-- Operate on Python objects and object fields
-
+🐍 **Python-friendly.**
+- Operate on Python objects and object fields: float scores, strings, matrices,
+  LLM response objects.
+- Run Python code over high-scale, terabyte-sized datasets, with built-in
+  parallelization and memory-efficient computing; no SQL or Spark required.

 🧠 **Data Enrichment and Processing.**
 - Generate metadata using local AI models and LLM APIs.
-- Filter, join, and group by metadata. Search by vector embeddings.
+- Filter, join, and group datasets by metadata. Search by vector embeddings.
+- High-performance vectorized operations on Python objects: sum, count, avg, etc.
 - Pass datasets to Pytorch and Tensorflow, or export them back into storage.

-🚀 **Efficiency.**
-- Parallelization, out-of-memory workloads and data caching.
-- Vectorized operations on Python object fields: sum, count, avg, etc.
-- Optimized vector search.
-

 Quick Start
 -----------
@@ -194,7 +204,7 @@ Batch inference with a simple sentiment model using the `transformers` library:

    pip install transformers

-The code below downloads files the cloud, and applies a user-defined function
+The code below downloads files from the cloud, and applies a user-defined function
 to each one of them. All files with a positive sentiment
 detected are then copied to the local directory.
@@ -427,6 +437,19 @@ name suffix, the following code will do it:
    loader = DataLoader(chain, batch_size=1)


+DataChain Studio Platform
+-------------------------
+
+`DataChain Studio`_ is a proprietary solution for teams that offers:
+
+- **Centralized dataset registry** to manage data, code, and dependencies
+  in one place.
+- **Data Lineage** for data sources as well as derivative datasets.
+- **UI for Multimodal Data** like images, videos, and PDFs.
+- **Scalable Compute** to handle large datasets (100M+ files) and in-house
+  AI model inference.
+- **Access control** including SSO and team-based collaboration.
+
 Tutorials
 ---------
@@ -460,6 +483,5 @@ Community and Support
 .. _Pydantic: https://github.com/pydantic/pydantic
 .. _publicly available: https://radar.kit.edu/radar/en/dataset/FdJmclKpjHzLfExE.ExpBot%2B-%2BA%2Bdataset%2Bof%2B79%2Bdialogs%2Bwith%2Ban%2Bexperimental%2Bcustomer%2Bservice%2Bchatbot
 .. _SQLite: https://www.sqlite.org/
-.. _Getting Started: https://datachain.
-..
-    :alt: DataChain FlowChart
+.. _Getting Started: https://docs.datachain.ai/
+.. _DataChain Studio: https://studio.datachain.ai/

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=nnTyB5MpCfBZ6D85JPz-5hUT7i-68Is-47Bxgew8lRw,930
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=hdVt_HJumQVgtaBAtBVJm-uPyYVogMXNVLmRcZyWHgk,36677
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=0IN-5y723y-bnFlieKtOFZLCjwX_yplFo3q0DV7LRPw,14821
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=AV23WZq-k6e2zeeNBhVQP1-2PrwNCYidO0HBDKzpVaA,7152
@@ -14,17 +14,17 @@ datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=6kxF7VxPAbh9D7_Bk8_SghS5OXrwUwSpDaw19eNCTP4,4083
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=Iwb562grttdGcrNVHCna_n7e884BqwGhQwAgYagBwyg,57347
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=Ai5m7alkAnv-RWXuLbZ95SKEPaQ3Pyk5ujDy50JDX5w,12692
 datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
 datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
@@ -33,7 +33,7 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=5b7o_CSHC2djottebYn-Hq5q0yaSLOKPIRCnaVRvjsU,36056
 datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
@@ -42,18 +42,18 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
-datachain/lib/dataset_info.py,sha256=
-datachain/lib/dc.py,sha256=
-datachain/lib/file.py,sha256=
+datachain/lib/dataset_info.py,sha256=q0EW9tj5jXGSD9Lzct9zbH4P1lfIGd_cIWqhnMxv7Q0,2464
+datachain/lib/dc.py,sha256=RQ8p95rzCMRY4ygFecO_hhQ3IgQHmbLXNqhcaINvGcI,85841
+datachain/lib/file.py,sha256=lHxE1wOGR4QJBQ3AYjhPLwpX72dOi06vkcwA-WSAGlg,14817
 datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=xwkE5bxJxUhZTjrA6jqN87XbSXPikCbL6eOPL9WyrKM,24556
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
@@ -71,6 +71,10 @@ datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9
 datachain/lib/func/__init__.py,sha256=wlAKhGV0QDg9y7reSwoUF8Vicfqh_YOUNIXLzxICGz4,403
 datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nfm8,10917
 datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
+datachain/lib/models/__init__.py,sha256=AGvjPbUokJiir3uelTa4XGtNSECkMFc5Xmi_N3AtxPQ,119
+datachain/lib/models/bbox.py,sha256=aiYNhvEcRK3dEN4MBcptmkPKc9kMP16ZQdu7xPk6hek,1555
+datachain/lib/models/pose.py,sha256=peuJPNSiGuTXfCfGIABwv8PGYistvTTBmtf-8X8E_eA,1077
+datachain/lib/models/yolo.py,sha256=eftoJDUa8iOpFTF1EkKVAd5Q-3HRd6X4eCIZ9h5p4nI,972
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
 datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
@@ -81,7 +85,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
 datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=g88kHdlRhmruiWwoIxq_JJoymZUrtMAL937NWQyWyXI,9209
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
 datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -100,9 +104,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.6.7.dist-info/LICENSE,sha256=
-datachain-0.6.7.dist-info/METADATA,sha256=
-datachain-0.6.7.dist-info/WHEEL,sha256=
-datachain-0.6.7.dist-info/entry_points.txt,sha256=
-datachain-0.6.7.dist-info/top_level.txt,sha256=
-datachain-0.6.7.dist-info/RECORD,,
+datachain-0.6.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.6.9.dist-info/METADATA,sha256=McKhuW43_7Q3iJKxueIYbk-rpYF6rbIKeFinzeeUzMo,18037
+datachain-0.6.9.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+datachain-0.6.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.6.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.6.9.dist-info/RECORD,,

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/LICENSE
File without changes

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/entry_points.txt
File without changes

{datachain-0.6.7.dist-info → datachain-0.6.9.dist-info}/top_level.txt
File without changes