datachain 0.2.18__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/cache.py +5 -10
- datachain/catalog/catalog.py +10 -20
- datachain/client/azure.py +5 -12
- datachain/client/fsspec.py +6 -10
- datachain/client/gcs.py +4 -14
- datachain/client/local.py +4 -11
- datachain/client/s3.py +4 -8
- datachain/data_storage/schema.py +7 -15
- datachain/data_storage/warehouse.py +34 -45
- datachain/lib/dc.py +8 -6
- datachain/lib/file.py +19 -18
- datachain/lib/udf.py +21 -14
- datachain/lib/webdataset.py +2 -3
- datachain/listing.py +14 -20
- datachain/node.py +32 -21
- datachain/query/batch.py +45 -41
- datachain/query/builtins.py +5 -12
- datachain/query/dataset.py +15 -8
- datachain/query/dispatch.py +53 -68
- datachain/query/queue.py +120 -0
- datachain/query/schema.py +3 -7
- datachain/query/udf.py +23 -8
- datachain/utils.py +17 -2
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/METADATA +1 -1
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/RECORD +29 -28
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/LICENSE +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/WHEEL +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -24,8 +24,7 @@ sha256 = partial(hashlib.sha256, usedforsecurity=False)
 @attrs.frozen
 class UniqueId:
     storage: "StorageURI"
-    parent: str
-    name: str
+    path: str
     size: int
     etag: str
     version: str = ""
@@ -34,10 +33,6 @@ class UniqueId:
     location: Optional[str] = None
     last_modified: datetime = TIME_ZERO

-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     def get_parsed_location(self) -> Optional[dict]:
         if not self.location:
             return None
@@ -53,10 +48,10 @@ class UniqueId:
         return loc_stack[0]

     def get_hash(self) -> str:
-        …
-        …
-            f"…
-        ).hexdigest()
+        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()


 def try_scandir(path):
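Net effect of this refactor: `UniqueId` stores a single `path` field instead of the old `parent`/`name` pair, and the cache key is a sha256 over storage, path, version, etag, and (when set) location. A minimal sketch of the new usage, with illustrative field values:

```py
# Minimal sketch; all values below are illustrative, and keyword arguments
# are assumed to work as usual for an attrs-generated __init__.
from datachain.cache import UniqueId

uid = UniqueId(
    storage="s3://bucket",
    path="animals/dogs/dog.jpg",  # replaces parent="animals/dogs", name="dog.jpg"
    size=1024,
    etag="abc123",
)
# Hashes "s3://bucket/animals/dogs/dog.jpg//abc123" (location appended if set).
print(uid.get_hash())
```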
datachain/catalog/catalog.py
CHANGED
@@ -529,21 +529,16 @@ def find_column_to_str( # noqa: PLR0911
     if column == "du":
         return str(
             src.listing.du(
-                {
-                    f: row[field_lookup[f]]
-                    for f in ["dir_type", "size", "parent", "name"]
-                }
+                {f: row[field_lookup[f]] for f in ["dir_type", "size", "path"]}
             )[0]
         )
     if column == "name":
-        return row[field_lookup["name"]] or ""
+        return posixpath.basename(row[field_lookup["path"]]) or ""
     if column == "owner":
         return row[field_lookup["owner_name"]] or ""
     if column == "path":
         is_dir = row[field_lookup["dir_type"]] == DirType.DIR
-        parent = row[field_lookup["parent"]]
-        name = row[field_lookup["name"]]
-        path = f"{parent}/{name}" if parent else name
+        path = row[field_lookup["path"]]
         if is_dir and path:
             full_path = path + "/"
         else:
@@ -681,7 +676,7 @@ class Catalog:

     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
-        return Client.parse_url(uri, self.metastore, self.cache, **config)
+        return Client.parse_url(uri, self.cache, **config)

     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
@@ -724,8 +719,7 @@ class Catalog:
         columns = [
             Column("vtype", String),
             Column("dir_type", Int),
-            Column("parent", String),
-            Column("name", String),
+            Column("path", String),
             Column("etag", String),
             Column("version", String),
             Column("is_latest", Boolean),
@@ -1623,8 +1617,7 @@ class Catalog:
         Example output:
             {
                 "source": "s3://ldb-public",
-                "parent": "animals/dogs",
-                "name": "dog.jpg",
+                "path": "animals/dogs/dog.jpg",
                 ...
             }
         """
@@ -1675,8 +1668,7 @@ class Catalog:
     def _get_row_uid(self, row: RowDict) -> UniqueId:
         return UniqueId(
             row["source"],
-            row["parent"],
-            row["name"],
+            row["path"],
             row["size"],
             row["etag"],
             row["version"],
@@ -2308,16 +2300,14 @@ class Catalog:
         if column == "du":
             field_set.add("dir_type")
             field_set.add("size")
-            field_set.add("parent")
-            field_set.add("name")
+            field_set.add("path")
         elif column == "name":
-            field_set.add("name")
+            field_set.add("path")
         elif column == "owner":
             field_set.add("owner_name")
         elif column == "path":
             field_set.add("dir_type")
-            field_set.add("parent")
-            field_set.add("name")
+            field_set.add("path")
         elif column == "size":
             field_set.add("size")
         elif column == "type":
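With only a `path` column left, display names are derived rather than stored: `find_column_to_str` now takes the basename of `path`. A one-line illustration (row contents are made up):

```py
import posixpath

row = {"path": "animals/dogs/dog.jpg"}
name = posixpath.basename(row["path"]) or ""  # "dog.jpg"; "" for the root path
```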
datachain/client/azure.py
CHANGED
@@ -1,4 +1,3 @@
-import posixpath
 from typing import Any

 from adlfs import AzureBlobFileSystem
@@ -14,16 +13,10 @@ class AzureClient(Client):
     PREFIX = "az://"
     protocol = "az"

-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         version_id = v.get("version_id")
-        name = v.get("name", "").split(DELIMITER)[-1]
-        if version_id:
-            version_suffix = f"?versionid={version_id}"
-            if name.endswith(version_suffix):
-                name = name[: -len(version_suffix)]
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v.get("etag", "").strip('"'),
             version=version_id or "",
             is_latest=version_id is None or bool(v.get("is_current_version")),
@@ -50,9 +43,9 @@ class AzureClient(Client):
             if not self._is_valid_key(b["name"]):
                 continue
             info = (await self.fs._details([b]))[0]
-            …
-            …
-            …
+            entries.append(
+                self.convert_info(info, self.rel_path(info["name"]))
+            )
         if entries:
             await result_queue.put(entries)
             pbar.update(len(entries))
datachain/client/fsspec.py
CHANGED
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
 if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem

-    from datachain.data_storage import AbstractMetastore

 logger = logging.getLogger("datachain")

@@ -116,13 +115,12 @@ class Client(ABC):
     @staticmethod
     def parse_url(
         source: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         **kwargs,
     ) -> tuple["Client", str]:
         cls = Client.get_implementation(source)
         storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url, metastore, cache, kwargs)
+        client = cls.from_name(storage_url, cache, kwargs)
         return client, rel_path

     @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         kwargs: dict[str, Any],
     ) -> "Client":
@@ -277,7 +274,7 @@ class Client(ABC):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, …))
+                files.append(self.convert_info(info, subprefix))
         if files:
             await result_queue.put(files)
         found_count = len(subdirs) + len(files)
@@ -360,12 +357,11 @@ class Client(ABC):

         parent_uid = UniqueId(
             parent["source"],
-            parent["…
-            parent["name"],
-            parent["etag"],
+            parent["path"],
             parent["size"],
-            parent["…
-            parent["…
+            parent["etag"],
+            vtype=parent["vtype"],
+            location=parent["location"],
         )
         f = self.open_object(parent_uid, use_cache=use_cache)
         return FileSlice(f, offset, size, posixpath.basename(uid.path))
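`Client.parse_url` and `from_name` drop the metastore parameter, so a client can now be built from just a source URL and a cache. A hedged sketch of the new call pattern; the `DataChainCache` constructor arguments here are an assumption for illustration, not documented API:

```py
# Sketch of the 0.3.1 call pattern; cache directories are illustrative.
from datachain.cache import DataChainCache
from datachain.client import Client

cache = DataChainCache("/tmp/dc-cache", "/tmp/dc-tmp")  # assumed constructor args
client, rel_path = Client.parse_url("s3://bucket/animals/dogs/", cache)
```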
datachain/client/gcs.py
CHANGED
@@ -1,7 +1,6 @@
 import asyncio
 import json
 import os
-import posixpath
 from collections.abc import Iterable
 from datetime import datetime
 from typing import Any, Optional, cast
@@ -110,20 +109,11 @@ class GCSClient(Client):

     def _entry_from_dict(self, d: dict[str, Any]) -> Entry:
         info = self.fs._process_object(self.name, d)
-        …
-        …
-        …
-        return self.convert_info(info, parent)
-
-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
-        name = v.get("name", "").split(DELIMITER)[-1]
-        if "generation" in v:
-            gen = f"#{v['generation']}"
-            if name.endswith(gen):
-                name = name[: -len(gen)]
+        return self.convert_info(info, self.rel_path(info["name"]))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v.get("etag", ""),
             version=v.get("generation", ""),
             is_latest=not v.get("timeDeleted"),
datachain/client/local.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import TYPE_CHECKING, Any
+from typing import Any
 from urllib.parse import urlparse

 from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI

 from .fsspec import Client

-if TYPE_CHECKING:
-    from datachain.data_storage import AbstractMetastore
-

 class FileClient(Client):
     FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
         return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())

     @classmethod
-    def from_name(
-        cls, name: str, metastore: "AbstractMetastore", cache, kwargs
-    ) -> "FileClient":
+    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)

@@ -140,11 +135,9 @@ class FileClient(Client):
         full_path += "/"
         return full_path

-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
-        name = posixpath.basename(v["name"])
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=path,
             etag=v["mtime"].hex(),
             is_latest=True,
             last_modified=datetime.fromtimestamp(v["mtime"], timezone.utc),
datachain/client/s3.py
CHANGED
@@ -1,5 +1,4 @@
 import asyncio
-import posixpath
 from typing import Any, cast

 from botocore.exceptions import NoCredentialsError
@@ -112,10 +111,8 @@ class ClientS3(Client):
         await self._fetch_flat(start_prefix, result_queue)

     def _entry_from_boto(self, v, bucket, versions=False):
-        parent, name = posixpath.split(v["Key"])
         return Entry.from_file(
-            parent=parent,
-            name=name,
+            path=v["Key"],
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
             is_latest=v.get("IsLatest", True),
@@ -145,7 +142,7 @@ class ClientS3(Client):
             if info["type"] == "directory":
                 subdirs.add(subprefix)
             else:
-                files.append(self.convert_info(info, …))
+                files.append(self.convert_info(info, subprefix))
             pbar.update()
             found = True
         if not found:
@@ -159,10 +156,9 @@ class ClientS3(Client):
     def clean_s3_version(ver):
         return ver if ver != "null" else ""

-    def convert_info(self, v: dict[str, Any], parent: str) -> Entry:
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
         return Entry.from_file(
-            parent=parent,
-            name=v.get("Key", "").split(DELIMITER)[-1],
+            path=path,
             etag=v.get("ETag", "").strip('"'),
             version=ClientS3.clean_s3_version(v.get("VersionId", "")),
             is_latest=v.get("IsLatest", True),
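The same convention lands in every client: `convert_info` receives the full bucket-relative path and forwards it to `Entry.from_file(path=...)`. A sketch of the new entry construction, using only the keywords that appear in the hunks above (values illustrative):

```py
from datachain.node import Entry

# path replaces the old parent/name pair; the other fields keep their meaning.
entry = Entry.from_file(
    path="animals/dogs/dog.jpg",
    etag="abc123",
    version="",
    is_latest=True,
)
```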
datachain/data_storage/schema.py
CHANGED
@@ -80,8 +80,7 @@ class DirExpansion:
             q.c.vtype,
             (q.c.dir_type == DirType.DIR).label("is_dir"),
             q.c.source,
-            q.c.parent,
-            q.c.name,
+            q.c.path,
             q.c.version,
             q.c.location,
         )
@@ -94,36 +93,29 @@ class DirExpansion:
                 q.c.vtype,
                 q.c.is_dir,
                 q.c.source,
-                q.c.parent,
-                q.c.name,
+                q.c.path,
                 q.c.version,
                 f.max(q.c.location).label("location"),
             )
             .select_from(q)
-            .group_by(
-                q.c.source, q.c.parent, q.c.name, q.c.vtype, q.c.is_dir, q.c.version
-            )
-            .order_by(
-                q.c.source, q.c.parent, q.c.name, q.c.vtype, q.c.is_dir, q.c.version
-            )
+            .group_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
+            .order_by(q.c.source, q.c.path, q.c.vtype, q.c.is_dir, q.c.version)
         )

     @classmethod
     def query(cls, q):
         q = cls.base_select(q).cte(recursive=True)
-        …
-        parent_name = path.name(q.c.parent)
+        parent = path.parent(q.c.path)
         q = q.union_all(
             sa.select(
                 sa.literal(-1).label("sys__id"),
                 sa.literal("").label("vtype"),
                 true().label("is_dir"),
                 q.c.source,
-                …
-                parent_name.label("name"),
+                parent.label("path"),
                 sa.literal("").label("version"),
                 null().label("location"),
-            ).where(…)
+            ).where(parent != "")
         )
         return cls.apply_group_by(q)
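`DirExpansion` now synthesizes directory rows by applying the `path.parent` SQL function to each file path instead of juggling `parent`/`name` columns. A small self-contained sketch of how that helper slots into a SQLAlchemy select (table and columns are illustrative):

```py
import sqlalchemy as sa
from datachain.sql.functions import path as pathfunc

files = sa.table("files", sa.column("source"), sa.column("path"))
parent = pathfunc.parent(files.c.path)

# One expansion step: emit each file's parent directory as a row,
# skipping the root (whose parent is "").
dirs = sa.select(files.c.source, parent.label("path")).where(parent != "")
print(dirs)
```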
datachain/data_storage/warehouse.py
CHANGED

@@ -17,8 +17,9 @@ from sqlalchemy.sql.expression import true

 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
+from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
 from datachain.storage import StorageURI
 from datachain.utils import sql_escape_like
@@ -200,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_select_paginated(
         self,
         query,
-        limit: Optional[int] = None,
-        order_by: tuple["ColumnElement[Any]", ...] = (),
         page_size: int = SELECT_BATCH_SIZE,
-    ) -> Generator[…
+    ) -> Generator[Sequence, None, None]:
         """
         This is equivalent to `db.execute`, but for selecting rows in batches
         """
-        …
-        …
+        limit = query._limit
+        paginated_query = query.limit(page_size)

-        if not …
-        …
-        …
-            ordering = order_by  # type: ignore[assignment]
-        …
-        # reset query order by and apply new order by id
-        paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
+        if not paginated_query._order_by_clauses:
+            # default order by is order by `sys__id`
+            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)

         results = None
         offset = 0
@@ -235,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
             processed = False
             for row in results:
                 processed = True
-                yield …
+                yield row
                 num_yielded += 1

             if not processed:
@@ -373,9 +368,7 @@ class AbstractWarehouse(ABC, Serializable):

         else:
             parent = self.get_node_by_path(dr, path.lstrip("/").rstrip("/*"))
-            select_query = select_query.where(
-                (dr.c.parent == parent.path) | (self.path_expr(dr) == path)
-            )
+            select_query = select_query.where(pathfunc.parent(dr.c.path) == parent.path)
         return select_query

     def rename_dataset_table(
@@ -532,8 +525,8 @@ class AbstractWarehouse(ABC, Serializable):
             dr,
             parent_path,
             type="dir",
-            conds=[sa.Column("parent") == parent_path],
-            order_by=["source", "…
+            conds=[pathfunc.parent(sa.Column("path")) == parent_path],
+            order_by=["source", "path"],
         )
         return self.get_nodes(query)
@@ -556,7 +549,7 @@ class AbstractWarehouse(ABC, Serializable):
                 & ~self.instr(relpath, "/")
                 & (self.path_expr(de) != dirpath)
             )
-            .order_by(de.c.source, de.c.…
+            .order_by(de.c.source, de.c.path, de.c.version)
         )

     def _get_node_by_path_list(
@@ -572,8 +565,8 @@ class AbstractWarehouse(ABC, Serializable):
         ).subquery()
         query = self.expand_query(de, dr)

-        q = query.where(…).order_by(
-            de.c.source, de.c.…
+        q = query.where(de.c.path == get_path(parent, name)).order_by(
+            de.c.source, de.c.path, de.c.version
         )
         row = next(self.dataset_rows_select(q), None)
         if not row:
@@ -636,8 +629,7 @@ class AbstractWarehouse(ABC, Serializable):
             case((de.c.is_dir == true(), DirType.DIR), else_=dr.c.dir_type).label(
                 "dir_type"
             ),
-            de.c.parent,
-            de.c.name,
+            de.c.path,
             with_default(dr.c.etag),
             de.c.version,
             with_default(dr.c.is_latest),
@@ -670,7 +662,7 @@ class AbstractWarehouse(ABC, Serializable):
             .where(
                 dr.c.is_latest == true(),
                 dr.c.dir_type != DirType.DIR,
-                …
+                dr.c.path.startswith(path),
             )
             .exists()
         )
@@ -678,8 +670,7 @@ class AbstractWarehouse(ABC, Serializable):
         if not row:
             raise FileNotFoundError(f"Unable to resolve path {path}")
         path = path.removesuffix("/")
-        …
-        return Node.from_dir(parent, name)
+        return Node.from_dir(path)

     def expand_path(self, dataset_rows: "DataTable", path: str) -> list[Node]:
         """Simulates Unix-like shell expansion"""
@@ -703,18 +694,21 @@ class AbstractWarehouse(ABC, Serializable):
         de = dr.dataset_dir_expansion(
             dr.select().where(dr.c.is_latest == true()).subquery()
         ).subquery()
-        where_cond = de.c.…
+        where_cond = pathfunc.parent(de.c.path) == parent_path
         if parent_path == "":
             # Exclude the root dir
-            where_cond = where_cond & (de.c.…
+            where_cond = where_cond & (de.c.path != "")
         inner_query = self.expand_query(de, dr).where(where_cond).subquery()
+
+        def field_to_expr(f):
+            if f == "name":
+                return pathfunc.name(inner_query.c.path)
+            return getattr(inner_query.c, f)
+
         return self.db.execute(
-            select(…)
-            .select_from(inner_query)
-            .order_by(
+            select(*(field_to_expr(f) for f in fields)).order_by(
                 inner_query.c.source,
-                inner_query.c.parent,
-                inner_query.c.name,
+                inner_query.c.path,
                 inner_query.c.version,
             )
         )
@@ -727,21 +721,20 @@ class AbstractWarehouse(ABC, Serializable):
         """
         dr = dataset_rows
         dirpath = f"{parent_path}/"
-        relpath = func.substr(self.path_expr(dr), len(dirpath) + 1)

         def field_to_expr(f):
             if f == "name":
-                return …
+                return pathfunc.name(dr.c.path)
             return getattr(dr.c, f)

         q = (
             select(*(field_to_expr(f) for f in fields))
             .where(
                 self.path_expr(dr).like(f"{sql_escape_like(dirpath)}%"),
-                ~self.instr(relpath, "/"),
+                ~self.instr(pathfunc.name(dr.c.path), "/"),
                 dr.c.is_latest == true(),
             )
-            .order_by(dr.c.source, dr.c.…
+            .order_by(dr.c.source, dr.c.path, dr.c.version, dr.c.etag)
         )
         return self.db.execute(q)
@@ -758,7 +751,7 @@ class AbstractWarehouse(ABC, Serializable):
         if isinstance(node, dict):
             is_dir = node.get("is_dir", node["dir_type"] in DirTypeGroup.SUBOBJ_DIR)
             node_size = node["size"]
-            path = …
+            path = node["path"]
         else:
             is_dir = node.is_container
             node_size = node.size
@@ -790,7 +783,7 @@ class AbstractWarehouse(ABC, Serializable):
         return results[0] or 0, 0

     def path_expr(self, t):
-        return …
+        return t.c.path

     def _find_query(
         self,
@@ -947,11 +940,7 @@ class AbstractWarehouse(ABC, Serializable):
         tq = target_query.alias("target_query")

         source_target_join = sa.join(
-            sq,
-            tq,
-            (sq.c.source == tq.c.source)
-            & (sq.c.parent == tq.c.parent)
-            & (sq.c.name == tq.c.name),
+            sq, tq, (sq.c.source == tq.c.source) & (sq.c.path == tq.c.path)
         )

         return (
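`dataset_select_paginated` no longer takes `limit`/`order_by` arguments: it reads the limit off the query itself and, when the query carries no ORDER BY, falls back to ordering by `sys__id` so pages stay stable. The same logic mirrored on a plain SQLAlchemy select (the table is illustrative; `_limit` and `_order_by_clauses` are the private attributes the hunk itself relies on):

```py
import sqlalchemy as sa

rows = sa.table("rows", sa.column("sys__id"), sa.column("path"))
query = sa.select(rows.c.sys__id, rows.c.path).limit(1000)

limit = query._limit                 # overall cap, taken from the query itself
paginated = query.limit(100)         # page-sized SELECT
if not paginated._order_by_clauses:  # no caller ordering: fall back to sys__id
    paginated = paginated.order_by(query.selected_columns.sys__id)
print(paginated)
```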
datachain/lib/dc.py
CHANGED
@@ -49,6 +49,7 @@ from datachain.query.dataset import (
     detach,
 )
 from datachain.query.schema import Column, DatasetRow
+from datachain.sql.functions import path as pathfunc
 from datachain.utils import inside_notebook

 if TYPE_CHECKING:
@@ -202,7 +203,7 @@ class DataChain(DatasetQuery):

     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
-        "…
+        "path": "",
         "vtype": "",
         "size": 0,
     }
@@ -1586,10 +1587,11 @@
         use_cache: bool = True,
     ) -> None:
         """Method that exports all files from chain to some folder."""
-        if placement == "filename" …
-        …
-        …
-        …
+        if placement == "filename" and (
+            super().distinct(pathfunc.name(C(f"{signal}__path"))).count()
+            != self.count()
+        ):
+            raise ValueError("Files with the same name found")

         for file in self.collect(signal):
             file.export(output, placement, use_cache)  # type: ignore[union-attr]
@@ -1621,7 +1623,7 @@

         Using glob to match patterns
         ```py
-        dc.filter(C("file.name").glob("*.jpg))
+        dc.filter(C("file.name").glob("*.jpg"))
         ```

         Using `datachain.sql.functions`
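The new guard in `export_files` makes `placement="filename"` fail fast when two files share a basename, since they would otherwise overwrite each other in the output folder. A hedged usage sketch (the bucket is illustrative):

```py
from datachain.lib.dc import DataChain

dc = DataChain.from_storage("s3://bucket/animals/")
# Raises ValueError("Files with the same name found") if two objects share a
# basename; otherwise each file is written under out/ by its own filename.
dc.export_files("out/", placement="filename")
```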
|