datachain 0.8.8__py3-none-any.whl → 0.8.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/cli/__init__.py +14 -7
- datachain/cli/commands/datasets.py +2 -3
- datachain/cli/parser/__init__.py +69 -82
- datachain/cli/parser/job.py +20 -25
- datachain/cli/parser/studio.py +41 -65
- datachain/cli/parser/utils.py +1 -1
- datachain/cli/utils.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/sqlite.py +38 -7
- datachain/data_storage/warehouse.py +2 -2
- datachain/lib/arrow.py +1 -1
- datachain/lib/convert/python_to_sql.py +15 -3
- datachain/lib/convert/unflatten.py +1 -2
- datachain/lib/dc.py +26 -5
- datachain/lib/file.py +27 -4
- datachain/lib/listing.py +4 -4
- datachain/lib/pytorch.py +3 -1
- datachain/lib/udf.py +56 -20
- datachain/model/bbox.py +9 -9
- datachain/model/pose.py +9 -9
- datachain/model/segment.py +6 -6
- datachain/progress.py +0 -13
- datachain/query/dataset.py +20 -14
- datachain/remote/studio.py +2 -2
- datachain/sql/sqlite/base.py +35 -14
- datachain/studio.py +22 -16
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/METADATA +4 -3
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/RECORD +32 -32
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/LICENSE +0 -0
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/WHEEL +0 -0
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/entry_points.txt +0 -0
- {datachain-0.8.8.dist-info → datachain-0.8.10.dist-info}/top_level.txt +0 -0
datachain/cli/parser/studio.py
CHANGED
@@ -1,34 +1,31 @@
-def add_studio_parser(subparsers, parent_parser) -> None:
-
-
-        "
-        "
-        "DataChain will utilize it for seamlessly sharing datasets\n"
-        "and using Studio features from CLI"
+def add_auth_parser(subparsers, parent_parser) -> None:
+    auth_help = "Manage Studio authentication"
+    auth_description = (
+        "Manage authentication and settings for Studio. "
+        "Configure tokens for sharing datasets and using Studio features."
     )
 
-
-        "
+    auth_parser = subparsers.add_parser(
+        "auth",
         parents=[parent_parser],
-        description=
-        help=
+        description=auth_description,
+        help=auth_help,
     )
-
+    auth_subparser = auth_parser.add_subparsers(
         dest="cmd",
-        help="Use `
-        required=True,
+        help="Use `datachain auth CMD --help` to display command-specific help",
     )
 
-
-
-        "
-        "
+    auth_login_help = "Authenticate with Studio"
+    auth_login_description = (
+        "Authenticate with Studio using default scopes. "
+        "A random name will be assigned as the token name if not specified."
     )
-    login_parser =
+    login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],
-        description=
-        help=
+        description=auth_login_description,
+        help=auth_login_help,
     )
 
     login_parser.add_argument(
@@ -36,14 +33,14 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--hostname",
         action="store",
         default=None,
-        help="
+        help="Hostname of the Studio instance",
     )
     login_parser.add_argument(
         "-s",
         "--scopes",
         action="store",
         default=None,
-        help="
+        help="Authentication token scopes",
     )
 
     login_parser.add_argument(
@@ -51,76 +48,55 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--name",
         action="store",
         default=None,
-        help="
-        "identify token shown in Studio profile.",
+        help="Authentication token name (shown in Studio profile)",
     )
 
     login_parser.add_argument(
         "--no-open",
         action="store_true",
         default=False,
-        help="Use
-        "You will be presented with user code to enter in browser.\n"
-        "DataChain will also use this if it cannot launch browser on your behalf.",
+        help="Use code-based authentication without browser",
     )
 
-
-
+    auth_logout_help = "Log out from Studio"
+    auth_logout_description = (
+        "Remove the Studio authentication token from global config."
+    )
 
-
+    auth_subparser.add_parser(
         "logout",
         parents=[parent_parser],
-        description=
-        help=
+        description=auth_logout_description,
+        help=auth_logout_help,
     )
 
-
-
-        "Set the default team for DataChain to use when interacting with Studio."
-    )
+    auth_team_help = "Set default team for Studio operations"
+    auth_team_description = "Set the default team for Studio operations."
 
-    team_parser =
+    team_parser = auth_subparser.add_parser(
         "team",
         parents=[parent_parser],
-        description=
-        help=
+        description=auth_team_description,
+        help=auth_team_help,
     )
     team_parser.add_argument(
         "team_name",
         action="store",
-        help="
+        help="Name of the team to set as default",
     )
     team_parser.add_argument(
         "--global",
         action="store_true",
         default=False,
-        help="Set
+        help="Set team globally for all projects",
    )
 
-
+    auth_token_help = "View Studio authentication token" # noqa: S105
+    auth_token_description = "Display the current authentication token for Studio." # noqa: S105
 
-
+    auth_subparser.add_parser(
         "token",
         parents=[parent_parser],
-        description=
-        help=
-    )
-
-    studio_ls_dataset_help = "List the available datasets from Studio"
-    studio_ls_dataset_description = (
-        "This command lists all the datasets available in Studio.\n"
-        "It will show the dataset name and the number of versions available."
-    )
-
-    ls_dataset_parser = studio_subparser.add_parser(
-        "dataset",
-        parents=[parent_parser],
-        description=studio_ls_dataset_description,
-        help=studio_ls_dataset_help,
-    )
-    ls_dataset_parser.add_argument(
-        "--team",
-        action="store",
-        default=None,
-        help="The team to list datasets for. By default, it will use team from config.",
+        description=auth_token_description,
+        help=auth_token_help,
     )
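A rough usage sketch (not part of the diff) of the renamed `auth` command group. The import path and `add_auth_parser(subparsers, parent_parser)` signature come from the hunks above; the top-level wiring is an assumption about how `datachain/cli/parser/__init__.py` calls it:

    # Hypothetical wiring of the new auth parser; the real CLI builds this
    # in datachain/cli/parser/__init__.py with its own shared parent parser.
    import argparse

    from datachain.cli.parser.studio import add_auth_parser

    parser = argparse.ArgumentParser(prog="datachain")
    subparsers = parser.add_subparsers(dest="command")
    parent = argparse.ArgumentParser(add_help=False)  # stand-in for the shared parent parser

    add_auth_parser(subparsers, parent)

    # Equivalent of running: datachain auth team my-team --global
    args = parser.parse_args(["auth", "team", "my-team", "--global"])
    print(args.cmd, args.team_name, getattr(args, "global"))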
datachain/cli/parser/utils.py
CHANGED
datachain/cli/utils.py
CHANGED
@@ -87,7 +87,7 @@ def get_logging_level(args: Namespace) -> int:
 def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
     if studio and not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     if local or studio:
datachain/client/local.py
CHANGED
@@ -38,7 +38,7 @@ class FileClient(Client):
     def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
-        return StorageURI(f
+        return StorageURI(f"{cls.PREFIX}/{name.removeprefix('/')}")
 
     @classmethod
     def ls_buckets(cls, **kwargs):
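A minimal sketch of the new URI construction, assuming `FileClient.PREFIX` is `"file://"` (an assumption; only the f-string itself is in the hunk above). The `removeprefix('/')` call avoids a doubled slash when `name` is already absolute:

    # Illustrative only; mirrors the f-string in the hunk above.
    PREFIX = "file://"

    def get_uri(name: str) -> str:
        return f"{PREFIX}/{name.removeprefix('/')}"

    assert get_uri("/home/user/data") == "file:///home/user/data"
    assert get_uri("home/user/data") == "file:///home/user/data"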
datachain/data_storage/sqlite.py
CHANGED
@@ -19,6 +19,7 @@ from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
+from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
 from sqlalchemy.sql.expression import bindparam, cast
 from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
@@ -40,7 +41,6 @@ if TYPE_CHECKING:
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
-    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine
 
     from datachain.lib.file import File
@@ -654,16 +654,47 @@ class SQLiteWarehouse(AbstractWarehouse):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-
+        full: bool = False,
+        columns=None,
+    ) -> "Select":
         """
         Join two tables together.
         """
-
-
-
-
-
+        if not full:
+            join_query = sqlalchemy.join(
+                left,
+                right,
+                onclause,
+                isouter=not inner,
+            )
+            return sqlalchemy.select(*columns).select_from(join_query)
+
+        left_right_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(left, right, onclause, isouter=True)
         )
+        right_left_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(right, left, onclause, isouter=True)
+        )
+
+        def add_left_rows_filter(exp: BinaryExpression):
+            """
+            Adds filter to right_left_join to remove unmatched left table rows by
+            getting column names that need to be NULL from BinaryExpressions in onclause
+            """
+            return right_left_join.where(
+                getattr(left.c, exp.left.name) == None # type: ignore[union-attr] # noqa: E711
+            )
+
+        if isinstance(onclause, BinaryExpression):
+            right_left_join = add_left_rows_filter(onclause)
+
+        if isinstance(onclause, BooleanClauseList):
+            for c in onclause.get_children():
+                if isinstance(c, BinaryExpression):
+                    right_left_join = add_left_rows_filter(c)
+
+        union = sqlalchemy.union(left_right_join, right_left_join).subquery()
+        return sqlalchemy.select(*union.c).select_from(union)
 
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
datachain/data_storage/warehouse.py
CHANGED
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
         _FromClauseArgument,
         _OnClauseArgument,
     )
-    from sqlalchemy.sql.selectable import
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import schema
@@ -873,7 +873,7 @@ class AbstractWarehouse(ABC, Serializable):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "
+    ) -> "Select":
         """
         Join two tables together.
         """
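A standalone sketch of the full-outer-join emulation introduced above: SQLite (before 3.39) has no FULL OUTER JOIN, so the warehouse builds it as a UNION of two LEFT OUTER JOINs, with the reversed join filtered down to rows that found no match on the other side. Table and column names below are placeholders, not datachain schema:

    import sqlalchemy as sa

    metadata = sa.MetaData()
    left = sa.Table("left_t", metadata, sa.Column("id", sa.Integer), sa.Column("a", sa.Integer))
    right = sa.Table("right_t", metadata, sa.Column("id", sa.Integer), sa.Column("b", sa.Integer))

    onclause = left.c.id == right.c.id
    columns = [left.c.id, left.c.a, right.c.b]

    # LEFT OUTER JOIN keeps unmatched left rows ...
    left_right = sa.select(*columns).select_from(sa.join(left, right, onclause, isouter=True))
    # ... and the reversed join, restricted to rows with no left match, contributes
    # the unmatched right rows; their UNION behaves like a FULL OUTER JOIN.
    right_left = (
        sa.select(*columns)
        .select_from(sa.join(right, left, onclause, isouter=True))
        .where(left.c.id == None)  # noqa: E711
    )
    full_outer = sa.union(left_right, right_left)
    print(full_outer)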
datachain/lib/arrow.py
CHANGED
@@ -33,7 +33,7 @@ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
         # reads the whole file in-memory.
         (uri,) = self.references[path]
         protocol, _ = split_protocol(uri)
-        return self.fss[protocol].
+        return self.fss[protocol].open(uri, mode, *args, **kwargs)
 
 
 class ArrowGenerator(Generator):
datachain/lib/convert/python_to_sql.py
CHANGED
@@ -52,15 +52,15 @@ def python_to_sql(typ): # noqa: PLR0911
 
     args = get_args(typ)
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
-        if args is None
+        if args is None:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
 
         args0 = args[0]
         if ModelStore.is_pydantic(args0):
             return Array(JSON())
 
-
-        return Array(
+        list_type = list_of_args_to_type(args)
+        return Array(list_type)
 
     if orig is Annotated:
         # Ignoring annotations
@@ -82,6 +82,18 @@ def python_to_sql(typ): # noqa: PLR0911
     raise TypeError(f"Cannot recognize type {typ}")
 
 
+def list_of_args_to_type(args) -> SQLType:
+    first_type = python_to_sql(args[0])
+    for next_arg in args[1:]:
+        try:
+            next_type = python_to_sql(next_arg)
+            if next_type != first_type:
+                return JSON()
+        except TypeError:
+            return JSON()
+    return first_type
+
+
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
         # List in JSON: Union[dict, list[dict]]
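A hedged illustration of what the new `list_of_args_to_type` helper changes for callers of `python_to_sql`: homogeneous element types keep a concrete array element type, while mixed or unresolvable element types fall back to a JSON array. The exact type objects printed depend on datachain's SQL type registry and are not guaranteed here:

    from datachain.lib.convert.python_to_sql import python_to_sql

    print(python_to_sql(list[int]))        # array of a single concrete integer type
    print(python_to_sql(tuple[int, str]))  # mixed element types -> Array(JSON())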
datachain/lib/convert/unflatten.py
CHANGED
@@ -35,8 +35,7 @@ def unflatten_to_json_pos(
     def _normalize(name: str) -> str:
         if DEFAULT_DELIMITER in name:
             raise RuntimeError(
-                f"variable '{name}' cannot be used "
-                f"because it contains {DEFAULT_DELIMITER}"
+                f"variable '{name}' cannot be used because it contains {DEFAULT_DELIMITER}"
             )
         return _to_snake_case(name)
 
datachain/lib/dc.py
CHANGED
@@ -11,6 +11,7 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
+    Literal,
     Optional,
     TypeVar,
     Union,
@@ -1276,7 +1277,12 @@ class DataChain:
             yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
-        self,
+        self,
+        transform=None,
+        tokenizer=None,
+        tokenizer_kwargs=None,
+        num_samples=0,
+        remove_prefetched: bool = False,
     ):
         """Convert to pytorch dataset format.
 
@@ -1286,6 +1292,7 @@ class DataChain:
             tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
             num_samples (int): Number of random samples to draw for each epoch.
                 This argument is ignored if `num_samples=0` (the default).
+            remove_prefetched (bool): Whether to remove prefetched files after reading.
 
         Example:
             ```py
@@ -1312,6 +1319,7 @@ class DataChain:
             tokenizer_kwargs=tokenizer_kwargs,
             num_samples=num_samples,
             dc_settings=chain._settings,
+            remove_prefetched=remove_prefetched,
         )
 
     def remove_file_signals(self) -> "Self": # noqa: D102
@@ -1324,6 +1332,7 @@ class DataChain:
         on: Union[MergeColType, Sequence[MergeColType]],
         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
         inner=False,
+        full=False,
         rname="right_",
     ) -> "Self":
         """Merge two chains based on the specified criteria.
@@ -1337,6 +1346,7 @@ class DataChain:
             right_on: Optional predicate or list of Predicates for the `right_ds`
                 to join.
             inner (bool): Whether to run inner join or outer join.
+            full (bool): Whether to run full outer join.
             rname (str): Name prefix for conflicting signal names.
 
         Examples:
@@ -1411,7 +1421,7 @@ class DataChain:
         )
 
         query = self._query.join(
-            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+            right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
         )
         query.feature_schema = None
         ds = self._evolve(query=query)
@@ -2415,11 +2425,22 @@ class DataChain:
     def export_files(
         self,
         output: str,
-        signal="file",
+        signal: str = "file",
         placement: FileExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
-        """
+        """Export files from a specified signal to a directory.
+
+        Args:
+            output: Path to the target directory for exporting files.
+            signal: Name of the signal to export files from.
+            placement: The method to use for naming exported files.
+                The possible values are: "filename", "etag", "fullpath", and "checksum".
+            use_cache: If `True`, cache the files before exporting.
+            link_type: Method to use for exporting files.
+                Falls back to `'copy'` if symlinking fails.
+        """
         if placement == "filename" and (
             self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
             != self._query.count()
@@ -2427,7 +2448,7 @@ class DataChain:
             raise ValueError("Files with the same name found")
 
         for file in self.collect(signal):
-            file.export(output, placement, use_cache) # type: ignore[union-attr]
+            file.export(output, placement, use_cache, link_type=link_type) # type: ignore[union-attr]
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
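A hedged usage sketch for the two DataChain API additions above: `full=True` on `merge()` requests a full outer join, and `link_type="symlink"` on `export_files()` symlinks instead of copying, falling back to copy when symlinking fails. Dataset names and signal columns are placeholders:

    from datachain.lib.dc import DataChain

    left = DataChain.from_dataset("left_ds")
    right = DataChain.from_dataset("right_ds")

    # keep unmatched rows from both sides
    merged = left.merge(right, on="id", full=True)

    files = DataChain.from_dataset("photos")
    files.export_files("exported/", signal="file", link_type="symlink")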
datachain/lib/file.py
CHANGED
@@ -1,3 +1,4 @@
+import errno
 import hashlib
 import io
 import json
@@ -76,18 +77,18 @@ class TarVFile(VFile):
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
         if len(location) > 1:
-            VFileError(file, "multiple 'location's are not supported yet")
+            raise VFileError(file, "multiple 'location's are not supported yet")
 
         loc = location[0]
 
         if (offset := loc.get("offset", None)) is None:
-            VFileError(file, "'offset' is not specified")
+            raise VFileError(file, "'offset' is not specified")
 
         if (size := loc.get("size", None)) is None:
-            VFileError(file, "'size' is not specified")
+            raise VFileError(file, "'size' is not specified")
 
         if (parent := loc.get("parent", None)) is None:
-            VFileError(file, "'parent' is not specified")
+            raise VFileError(file, "'parent' is not specified")
 
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
@@ -236,11 +237,26 @@ class File(DataModel):
         with open(destination, mode="wb") as f:
             f.write(self.read())
 
+    def _symlink_to(self, destination: str):
+        if self.location:
+            raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
+
+        if self._caching_enabled:
+            self.ensure_cached()
+            source = self.get_local_path()
+            assert source, "File was not cached"
+        elif self.source.startswith("file://"):
+            source = self.get_path()
+        else:
+            raise OSError(errno.EXDEV, "can't link across filesystems")
+        return os.symlink(source, destination)
+
     def export(
         self,
         output: str,
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
         """Export file to new location."""
         if use_cache:
@@ -249,6 +265,13 @@ class File(DataModel):
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
+        if link_type == "symlink":
+            try:
+                return self._symlink_to(dst)
+            except OSError as exc:
+                if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
+                    raise
+
         self.save(dst)
 
     def _set_stream(
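A standalone sketch of the copy-fallback pattern that `File.export()` now uses: try to symlink, swallow only the errno values that mean symlinking is not possible here, and copy instead. Paths and the helper name are illustrative:

    import errno
    import os
    import shutil

    def export_with_fallback(src: str, dst: str, link_type: str = "copy") -> None:
        if link_type == "symlink":
            try:
                os.symlink(src, dst)
                return
            except OSError as exc:
                # ENOTSUP/EXDEV/ENOSYS: unsupported or cross-device; fall back to copy
                if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                    raise
        shutil.copyfile(src, dst)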
datachain/lib/listing.py
CHANGED
@@ -113,14 +113,14 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], st
     telemetry.log_param("client", client.PREFIX)
 
     if not uri.endswith("/") and _isfile(client, uri):
-        return None, f
+        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
-        storage_uri, path = Client.parse_url(f
+        storage_uri, path = Client.parse_url(f"{uri.rstrip('/')}/")
         lst_uri_path = path
 
-    lst_uri = f
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
@@ -180,7 +180,7 @@ def get_listing(
     # for local file system we need to fix listing path / prefix
     # if we are reusing existing listing
     if isinstance(client, FileClient) and listing and listing.name != ds_name:
-        list_path = f
+        list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
 
     ds_name = listing.name if listing else ds_name
 
datachain/lib/pytorch.py
CHANGED
@@ -50,6 +50,7 @@ class PytorchDataset(IterableDataset):
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
         num_samples: int = 0,
         dc_settings: Optional[Settings] = None,
+        remove_prefetched: bool = False,
     ):
         """
         Pytorch IterableDataset that streams DataChain datasets.
@@ -84,6 +85,7 @@ class PytorchDataset(IterableDataset):
 
         self._cache = catalog.cache
         self._prefetch_cache: Optional[Cache] = None
+        self._remove_prefetched = remove_prefetched
         if prefetch and not self.cache:
            tmp_dir = catalog.cache.tmp_dir
            assert tmp_dir
@@ -147,7 +149,7 @@ class PytorchDataset(IterableDataset):
                 rows,
                 self.prefetch,
                 download_cb=download_cb,
-
+                remove_prefetched=self._remove_prefetched,
             )
 
         with download_cb, closing(rows):