datachain 0.2.18__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- datachain/cache.py +5 -10
- datachain/catalog/catalog.py +10 -20
- datachain/client/azure.py +5 -12
- datachain/client/fsspec.py +6 -10
- datachain/client/gcs.py +4 -14
- datachain/client/local.py +4 -11
- datachain/client/s3.py +4 -8
- datachain/data_storage/schema.py +7 -15
- datachain/data_storage/warehouse.py +34 -45
- datachain/lib/dc.py +8 -6
- datachain/lib/file.py +19 -18
- datachain/lib/udf.py +21 -14
- datachain/lib/webdataset.py +2 -3
- datachain/listing.py +14 -20
- datachain/node.py +32 -21
- datachain/query/batch.py +45 -41
- datachain/query/builtins.py +5 -12
- datachain/query/dataset.py +15 -8
- datachain/query/dispatch.py +53 -68
- datachain/query/queue.py +120 -0
- datachain/query/schema.py +3 -7
- datachain/query/udf.py +23 -8
- datachain/utils.py +17 -2
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/METADATA +1 -1
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/RECORD +29 -28
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/LICENSE +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/WHEEL +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.18.dist-info → datachain-0.3.1.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -111,8 +111,7 @@ class File(DataModel):
     """`DataModel` for reading binary files."""
 
     source: str = Field(default="")
-    parent: str = Field(default="")
-    name: str
+    path: str
     size: int = Field(default=0)
     version: str = Field(default="")
     etag: str = Field(default="")
@@ -123,8 +122,7 @@ class File(DataModel):
 
     _datachain_column_types: ClassVar[dict[str, Any]] = {
         "source": String,
-        "parent": String,
-        "name": String,
+        "path": String,
         "size": Int,
         "version": String,
         "etag": String,
@@ -136,8 +134,7 @@ class File(DataModel):
 
     _unique_id_keys: ClassVar[list[str]] = [
        "source",
-        "parent",
-        "name",
+        "path",
        "size",
        "etag",
        "version",
@@ -168,11 +165,9 @@ class File(DataModel):
     def validate_location(cls, v):
         return File._validate_dict(v)
 
-    @field_validator("parent", "name", mode="before")
+    @field_validator("path", mode="before")
     @classmethod
     def validate_path(cls, path):
-        if path == "":
-            return ""
         return Path(path).as_posix()
 
     def model_dump_custom(self):
@@ -185,6 +180,14 @@ class File(DataModel):
         self._catalog = None
         self._caching_enabled = False
 
+    @property
+    def name(self):
+        return PurePosixPath(self.path).name
+
+    @property
+    def parent(self):
+        return str(PurePosixPath(self.path).parent)
+
     @contextmanager
     def open(self, mode: Literal["rb", "r"] = "rb"):
         """Open the file and return a file object."""
@@ -261,19 +264,19 @@ class File(DataModel):
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
-        return Path(self.name).suffix
+        return PurePosixPath(self.path).suffix
 
     def get_file_ext(self):
         """Returns last part of file name without `.`."""
-        return Path(self.name).suffix.strip(".")
+        return PurePosixPath(self.path).suffix.strip(".")
 
     def get_file_stem(self):
         """Returns file name without extension."""
-        return Path(self.name).stem
+        return PurePosixPath(self.path).stem
 
     def get_full_name(self):
         """Returns name with parent directories."""
-        return f"{self.parent}/{self.name}" if self.parent else self.name
+        return self.path
 
     def get_uri(self):
         """Returns file URI."""
@@ -355,8 +358,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
 
     def get_file_type(
         source: str,
-        parent: str,
-        name: str,
+        path: str,
         size: int,
         version: str,
         etag: str,
@@ -367,8 +369,7 @@ def get_file(type_: Literal["binary", "text", "image"] = "binary"):
     ) -> file:  # type: ignore[valid-type]
         return file(
             source=source,
-            parent=parent,
-            name=name,
+            path=path,
             size=size,
             version=version,
             etag=etag,
datachain/lib/udf.py
CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import RowBatch
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties, UDFResult
+from datachain.query.udf import UDFProperties
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
-    from datachain.query.batch import BatchingResult
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult
 
 
 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
 
     def run(
         self,
-        udf_inputs: "Iterable[BatchingResult]",
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable[UDFResult]]:
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()
 
-        for batch in udf_inputs:
-            n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
-            output = self.run_once(catalog, batch, is_generator, cache, cb=download_cb)
-            processed_cb.relative_update(n_rows)
-            yield output
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "BatchingResult",
+        arg: "UDFInput",
        is_generator: bool = False,
        cache: bool = False,
        cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg, RowBatch):
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/lib/webdataset.py
CHANGED
@@ -119,7 +119,7 @@ class Builder:
         return self._tar.extractfile(item).read().decode(self._encoding)
 
     def add(self, file: tarfile.TarInfo):
-        fstream = File(name=file.name)
+        fstream = File(path=file.name)
         ext = fstream.get_file_ext()
         stem = fstream.get_file_stem()
 
@@ -176,9 +176,8 @@ class Builder:
         )
         etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
         return File(
-            name=core_file.name,
             source=self._tar_stream.source,
-            parent=new_parent,
+            path=f"{new_parent}/{core_file.name}",
             version=self._tar_stream.version,
             size=core_file.size,
             etag=etag,
datachain/listing.py
CHANGED
@@ -5,11 +5,12 @@ from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
 from fsspec.asyn import get_loop, sync
-from sqlalchemy import Column, case
+from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
 from datachain.node import DirType, Entry, Node, NodeWithPath
+from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
 
 if TYPE_CHECKING:
@@ -129,7 +130,7 @@ class Listing:
         dir_path = []
         if not copy_dir_contents:
             dir_path.append(node.name)
-        subtree_nodes = src.find(sort=["parent", "name"])
+        subtree_nodes = src.find(sort=["path"])
         all_nodes.extend(
             NodeWithPath(n.n, path=dir_path + n.path) for n in subtree_nodes
         )
@@ -148,8 +149,7 @@ class Listing:
         elif from_dataset:
             node_path = [
                 src.listing.client.name,
-                node.parent,
-                node.name,
+                node.path,
             ]
         else:
             node_path = [node.name]
@@ -201,25 +201,19 @@ class Listing:
         dr = self.dataset_rows
         conds = []
         if names:
-            f = Column("name").op("GLOB")
-            conds.extend(f(name) for name in names)
+            for name in names:
+                conds.append(pathfunc.name(Column("path")).op("GLOB")(name))
         if inames:
-            f = func.lower(Column("name")).op("GLOB")
-            conds.extend(f(iname.lower()) for iname in inames)
+            for iname in inames:
+                conds.append(
+                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(iname.lower())
+                )
         if paths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = node_path.op("GLOB")
-            conds.extend(f(path) for path in paths)
+            for path in paths:
+                conds.append(Column("path").op("GLOB")(path))
         if ipaths:
-            node_path = case(
-                (Column("parent") == "", Column("name")),
-                else_=Column("parent") + "/" + Column("name"),
-            )
-            f = func.lower(node_path).op("GLOB")
-            conds.extend(f(ipath.lower()) for ipath in ipaths)
+            for ipath in ipaths:
+                conds.append(func.lower(Column("path")).op("GLOB")(ipath.lower()))
 
         if size is not None:
             size_limit = suffix_to_number(size)
datachain/node.py
CHANGED
@@ -50,8 +50,7 @@ class Node:
     sys__rand: int = -1
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: Optional[str] = None
     is_latest: bool = True
@@ -62,10 +61,6 @@ class Node:
     location: Optional[str] = None
     source: StorageURI = StorageURI("")
 
-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def is_dir(self) -> bool:
         return self.dir_type == DirType.DIR
@@ -107,13 +102,12 @@ class Node:
             return self.path + "/"
         return self.path
 
-    def as_uid(self, storage: Optional[StorageURI] = None):
+    def as_uid(self, storage: Optional[StorageURI] = None) -> UniqueId:
         if storage is None:
             storage = self.source
         return UniqueId(
             storage=storage,
-            parent=self.parent,
-            name=self.name,
+            path=self.path,
             size=self.size,
             version=self.version or "",
             etag=self.etag,
@@ -129,20 +123,30 @@ class Node:
         return cls(**kw)
 
     @classmethod
-    def from_dir(cls, parent, name, **kwargs) -> "Node":
-        return cls(sys__id=-1, dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path, **kwargs) -> "Node":
+        return cls(sys__id=-1, dir_type=DirType.DIR, path=path, **kwargs)
 
     @classmethod
     def root(cls) -> "Node":
         return cls(sys__id=-1, dir_type=DirType.DIR)
 
+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+
 
 @attrs.define
 class Entry:
     vtype: str = ""
     dir_type: Optional[int] = None
-    parent: str = ""
-    name: str = ""
+    path: str = ""
     etag: str = ""
     version: str = ""
     is_latest: bool = True
@@ -157,27 +161,34 @@ class Entry:
         return self.dir_type == DirType.DIR
 
     @classmethod
-    def from_dir(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.DIR, parent=parent, name=name, **kwargs)
+    def from_dir(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.DIR, path=path, **kwargs)
 
     @classmethod
-    def from_file(cls, parent: str, name: str, **kwargs) -> "Entry":
-        return cls(dir_type=DirType.FILE, parent=parent, name=name, **kwargs)
+    def from_file(cls, path: str, **kwargs) -> "Entry":
+        return cls(dir_type=DirType.FILE, path=path, **kwargs)
 
     @classmethod
     def root(cls):
         return cls(dir_type=DirType.DIR)
 
-    @property
-    def path(self) -> str:
-        return f"{self.parent}/{self.name}" if self.parent else self.name
-
     @property
     def full_path(self) -> str:
         if self.is_dir and self.path:
             return self.path + "/"
         return self.path
 
+    @property
+    def name(self):
+        return self.path.rsplit("/", 1)[-1]
+
+    @property
+    def parent(self):
+        split = self.path.rsplit("/", 1)
+        if len(split) <= 1:
+            return ""
+        return split[0]
+
 
 def get_path(parent: str, name: str):
     return f"{parent}/{name}" if parent else name
datachain/query/batch.py
CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 
 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict
 
 
 @dataclass
-class RowBatch:
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]
 
 
-BatchingResult = Union["RowDict", RowBatch]
+UDFInput = Union["RowDict", UDFInputBatch]
 
 
 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[BatchingResult, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator["RowDict", None, None]:
-        return execute(query, limit=query._limit, order_by=query._order_by_clauses)
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)
 
 
 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list["RowDict"] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield RowBatch(batch)
+                    yield RowsOutputBatch(batch)
 
         if len(results) > 0:
-            yield RowBatch(results)
+            yield RowsOutputBatch(results)
 
 
 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query: "sa.Select",
-    ) -> Generator[RowBatch, None, None]:
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         current_partition: Optional[int] = None
-        batch: list["RowDict"] = []
-
-        with contextlib.closing(
-            execute(
-                query,
-                order_by=(PARTITION_COLUMN_ID, "sys__id", *query._order_by_clauses),
-                limit=query._limit,
-            )
-        ) as rows:
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
             for row in rows:
-                partition = row[PARTITION_COLUMN_ID]
+                partition = row[partition_column_idx]
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield RowBatch(batch)
+                        yield RowsOutputBatch(batch)
                     batch = []
                 batch.append(row)
 
         if len(batch) > 0:
-            yield RowBatch(batch)
+            yield RowsOutputBatch(batch)
datachain/query/builtins.py
CHANGED
@@ -20,8 +20,7 @@ def load_tar(raw):
 @udf(
     (
         C.source,
-        C.name,
-        C.parent,
+        C.path,
         C.size,
         C.vtype,
         C.dir_type,
@@ -37,8 +36,7 @@ def load_tar(raw):
 )
 def index_tar(
     source,
-    name,
-    parent,
+    parent_path,
     size,
     vtype,
     dir_type,
@@ -52,9 +50,8 @@ def index_tar(
 ):
     # generate original tar files as well, along with subobjects
     yield DatasetRow.create(
-        name,
         source=source,
-        parent=parent,
+        path=parent_path,
         size=size,
         vtype=vtype,
         dir_type=dir_type,
@@ -66,15 +63,12 @@ def index_tar(
         etag=etag,
     )
 
-    parent_path = name if not parent else f"{parent}/{name}"
     for info in tar_entries:
         if info.isfile():
             full_path = f"{parent_path}/{info.name}"
-            parent_dir, subobject_name = full_path.rsplit("/", 1)
             yield DatasetRow.create(
-                subobject_name,
                 source=source,
-                parent=parent_dir,
+                path=full_path,
                 size=info.size,
                 vtype="tar",
                 location={
@@ -83,8 +77,7 @@ def index_tar(
                     "size": info.size,
                     "parent": {
                         "source": source,
-                        "parent": parent,
-                        "name": name,
+                        "path": parent_path,
                         "version": version,
                         "size": size,
                         "etag": etag,
datachain/query/dataset.py
CHANGED
@@ -307,7 +307,7 @@ class Subtract(DatasetDiffOperation):
 class Changed(DatasetDiffOperation):
     """
     Calculates rows that are changed in a source query compared to target query
-    Changed means it has same source + parent + name but different last_modified
+    Changed means it has same source + path but different last_modified
     Example:
     >>> ds = DatasetQuery(name="dogs_cats") # some older dataset with embeddings
     >>> ds_updated = (
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
 
         processes = determine_processes(self.parallel)
 
+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@ class UDFStep(Step, ABC):
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@ class UDFStep(Step, ABC):
                 "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                 "table": udf_table,
                 "query": query,
+                "udf_fields": udf_fields,
                 "batching": batching,
                 "processes": processes,
                 "is_generator": self.is_generator,
@@ -528,6 +532,7 @@ class UDFStep(Step, ABC):
             generated_cb = get_generated_callback(self.is_generator)
             try:
                 udf_results = udf.run(
+                    udf_fields,
                     udf_inputs,
                     self.catalog,
                     self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]
 
-            def row_iter() -> Generator["RowDict", None, None]:
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows
 
-            async def get_params(row: "RowDict") -> tuple:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(self.catalog, row, mapper, **kwargs)
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                         for p in actual_params
                     ]
                 )
@@ -1526,7 +1533,7 @@ class DatasetQuery:
 
     @detach
     def subtract(self, dq: "DatasetQuery") -> "Self":
-        return self._subtract(dq, on=["source", "parent", "name"])
+        return self._subtract(dq, on=["source", "path"])
 
     @detach
     def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":