datachain 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +1 -1
- datachain/client/fsspec.py +1 -4
- datachain/client/local.py +2 -7
- datachain/data_storage/warehouse.py +8 -14
- datachain/lib/dc.py +1 -1
- datachain/lib/udf.py +21 -14
- datachain/query/batch.py +45 -41
- datachain/query/dataset.py +13 -6
- datachain/query/dispatch.py +53 -68
- datachain/query/queue.py +120 -0
- datachain/query/udf.py +23 -8
- datachain/utils.py +17 -2
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/METADATA +1 -1
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/RECORD +18 -17
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/LICENSE +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/WHEEL +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -676,7 +676,7 @@ class Catalog:
 
     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
-        return Client.parse_url(uri, self.
+        return Client.parse_url(uri, self.cache, **config)
 
     def get_client(self, uri: StorageURI, **config: Any) -> Client:
         """
datachain/client/fsspec.py
CHANGED
@@ -37,7 +37,6 @@ from datachain.storage import StorageURI
 if TYPE_CHECKING:
     from fsspec.spec import AbstractFileSystem
 
-    from datachain.data_storage import AbstractMetastore
 
 logger = logging.getLogger("datachain")
 
@@ -116,13 +115,12 @@ class Client(ABC):
     @staticmethod
     def parse_url(
         source: str,
-        metastore: "AbstractMetastore",
         cache: DataChainCache,
         **kwargs,
     ) -> tuple["Client", str]:
         cls = Client.get_implementation(source)
         storage_url, rel_path = cls.split_url(source)
-        client = cls.from_name(storage_url,
+        client = cls.from_name(storage_url, cache, kwargs)
         return client, rel_path
 
     @classmethod
@@ -136,7 +134,6 @@ class Client(ABC):
     def from_name(
         cls,
         name: str,
-        metastore: "AbstractMetastore",
        cache: DataChainCache,
        kwargs: dict[str, Any],
    ) -> "Client":
datachain/client/local.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import posixpath
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import
+from typing import Any
 from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
@@ -12,9 +12,6 @@ from datachain.storage import StorageURI
 
 from .fsspec import Client
 
-if TYPE_CHECKING:
-    from datachain.data_storage import AbstractMetastore
-
 
 class FileClient(Client):
     FS_CLASS = LocalFileSystem
@@ -97,9 +94,7 @@ class FileClient(Client):
         return cls.root_dir(), uri.removeprefix(cls.root_path().as_uri())
 
     @classmethod
-    def from_name(
-        cls, name: str, metastore: "AbstractMetastore", cache, kwargs
-    ) -> "FileClient":
+    def from_name(cls, name: str, cache, kwargs) -> "FileClient":
         use_symlinks = kwargs.pop("use_symlinks", False)
         return cls(name, kwargs, cache, use_symlinks=use_symlinks)
 
datachain/data_storage/warehouse.py
CHANGED
@@ -17,7 +17,7 @@ from sqlalchemy.sql.expression import true
 
 from datachain.client import Client
 from datachain.data_storage.serializer import Serializable
-from datachain.dataset import DatasetRecord
+from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
 from datachain.sql.functions import path as pathfunc
 from datachain.sql.types import Int, SQLType
@@ -201,23 +201,17 @@ class AbstractWarehouse(ABC, Serializable):
     def dataset_select_paginated(
         self,
         query,
-        limit: Optional[int] = None,
-        order_by: tuple["ColumnElement[Any]", ...] = (),
         page_size: int = SELECT_BATCH_SIZE,
-    ) -> Generator[
+    ) -> Generator[Sequence, None, None]:
         """
         This is equivalent to `db.execute`, but for selecting rows in batches
         """
-
-
+        limit = query._limit
+        paginated_query = query.limit(page_size)
 
-        if not
-
-
-        ordering = order_by  # type: ignore[assignment]
-
-        # reset query order by and apply new order by id
-        paginated_query = query.order_by(None).order_by(*ordering).limit(page_size)
+        if not paginated_query._order_by_clauses:
+            # default order by is order by `sys__id`
+            paginated_query = paginated_query.order_by(query.selected_columns.sys__id)
 
         results = None
         offset = 0
@@ -236,7 +230,7 @@ class AbstractWarehouse(ABC, Serializable):
             processed = False
             for row in results:
                 processed = True
-                yield
+                yield row
                 num_yielded += 1
 
             if not processed:
datachain/lib/dc.py
CHANGED
datachain/lib/udf.py
CHANGED
@@ -1,6 +1,5 @@
 import sys
 import traceback
-from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Callable, Optional
 
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -14,16 +13,19 @@ from datachain.lib.model_store import ModelStore
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
-from datachain.query.batch import
+from datachain.query.batch import UDFInputBatch
 from datachain.query.schema import ColumnParameter
 from datachain.query.udf import UDFBase as _UDFBase
-from datachain.query.udf import UDFProperties
+from datachain.query.udf import UDFProperties
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable, Iterator, Sequence
+
     from typing_extensions import Self
 
     from datachain.catalog import Catalog
-    from datachain.query.batch import
+    from datachain.query.batch import RowsOutput, UDFInput
+    from datachain.query.udf import UDFResult
 
 
 class UdfError(DataChainParamsError):
@@ -42,22 +44,27 @@ class UDFAdapter(_UDFBase):
 
     def run(
         self,
-
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
         download_cb: Callback = DEFAULT_CALLBACK,
         processed_cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterator[Iterable[
+    ) -> "Iterator[Iterable[UDFResult]]":
         self.inner._catalog = catalog
         if hasattr(self.inner, "setup") and callable(self.inner.setup):
             self.inner.setup()
 
-
-
-
-
-
+        yield from super().run(
+            udf_fields,
+            udf_inputs,
+            catalog,
+            is_generator,
+            cache,
+            download_cb,
+            processed_cb,
+        )
 
         if hasattr(self.inner, "teardown") and callable(self.inner.teardown):
             self.inner.teardown()
@@ -65,12 +72,12 @@ class UDFAdapter(_UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
-    ) -> Iterable[UDFResult]:
-        if isinstance(arg,
+    ) -> "Iterable[UDFResult]":
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/query/batch.py
CHANGED
@@ -5,21 +5,29 @@ from collections.abc import Generator, Sequence
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Union
 
-import sqlalchemy as sa
-
 from datachain.data_storage.schema import PARTITION_COLUMN_ID
 from datachain.data_storage.warehouse import SELECT_BATCH_SIZE
 
 if TYPE_CHECKING:
+    from sqlalchemy import Select
+
     from datachain.dataset import RowDict
 
 
 @dataclass
-class
+class RowsOutputBatch:
+    rows: Sequence[Sequence]
+
+
+RowsOutput = Union[Sequence, RowsOutputBatch]
+
+
+@dataclass
+class UDFInputBatch:
     rows: Sequence["RowDict"]
 
 
-
+UDFInput = Union["RowDict", UDFInputBatch]
 
 
 class BatchingStrategy(ABC):
@@ -28,9 +36,9 @@ class BatchingStrategy(ABC):
     @abstractmethod
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutput, None, None]:
         """Apply the provided parameters to the UDF."""
 
 
@@ -42,10 +50,10 @@ class NoBatching(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
-        return execute(query
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[Sequence, None, None]:
+        return execute(query)
 
 
 class Batch(BatchingStrategy):
@@ -59,31 +67,24 @@ class Batch(BatchingStrategy):
 
     def __call__(
         self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
         # choose page size that is a multiple of the batch size
         page_size = math.ceil(SELECT_BATCH_SIZE / self.count) * self.count
 
         # select rows in batches
-        results: list[
-
-        with contextlib.closing(
-            execute(
-                query,
-                page_size=page_size,
-                limit=query._limit,
-                order_by=query._order_by_clauses,
-            )
-        ) as rows:
+        results: list[Sequence] = []
+
+        with contextlib.closing(execute(query, page_size=page_size)) as rows:
             for row in rows:
                 results.append(row)
                 if len(results) >= self.count:
                     batch, results = results[: self.count], results[self.count :]
-                    yield
+                    yield RowsOutputBatch(batch)
 
         if len(results) > 0:
-            yield
+            yield RowsOutputBatch(results)
 
 
 class Partition(BatchingStrategy):
@@ -95,27 +96,30 @@ class Partition(BatchingStrategy):
 
     def __call__(
        self,
-        execute: Callable,
-        query:
-    ) -> Generator[
+        execute: Callable[..., Generator[Sequence, None, None]],
+        query: "Select",
+    ) -> Generator[RowsOutputBatch, None, None]:
        current_partition: Optional[int] = None
-        batch: list[
-
-
-
-
-
-
-
+        batch: list[Sequence] = []
+
+        query_fields = [str(c.name) for c in query.selected_columns]
+        partition_column_idx = query_fields.index(PARTITION_COLUMN_ID)
+
+        ordered_query = query.order_by(None).order_by(
+            PARTITION_COLUMN_ID,
+            "sys__id",
+            *query._order_by_clauses,
+        )
+
+        with contextlib.closing(execute(ordered_query)) as rows:
             for row in rows:
-                partition = row[
+                partition = row[partition_column_idx]
                 if current_partition != partition:
                     current_partition = partition
                     if len(batch) > 0:
-                        yield
+                        yield RowsOutputBatch(batch)
                     batch = []
                 batch.append(row)
 
         if len(batch) > 0:
-            yield
+            yield RowsOutputBatch(batch)
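A note on the new types introduced above: RowsOutputBatch wraps raw warehouse rows, UDFInputBatch wraps rows that have already been converted to RowDicts, and RowsOutput/UDFInput are the corresponding single-or-batch unions. A minimal sketch (illustrative only, not code from the release) of how a consumer can tell the two output shapes apart:

    from datachain.query.batch import RowsOutput, RowsOutputBatch

    def describe(output: RowsOutput) -> str:
        # Batch and Partition strategies yield RowsOutputBatch wrappers,
        # while NoBatching yields bare row sequences.
        if isinstance(output, RowsOutputBatch):
            return f"batch of {len(output.rows)} rows"
        return "single row"

    print(describe(RowsOutputBatch([(1, "a"), (2, "b")])))  # batch of 2 rows
    print(describe((3, "c")))                               # single row
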
datachain/query/dataset.py
CHANGED
@@ -461,6 +461,8 @@ class UDFStep(Step, ABC):
 
         processes = determine_processes(self.parallel)
 
+        udf_fields = [str(c.name) for c in query.selected_columns]
+
         try:
             if workers:
                 from datachain.catalog.loader import get_distributed_class
@@ -473,6 +475,7 @@
                     query,
                     workers,
                     processes,
+                    udf_fields=udf_fields,
                     is_generator=self.is_generator,
                     use_partitioning=use_partitioning,
                     cache=self.cache,
@@ -489,6 +492,7 @@
                     "warehouse_clone_params": self.catalog.warehouse.clone_params(),
                     "table": udf_table,
                     "query": query,
+                    "udf_fields": udf_fields,
                     "batching": batching,
                     "processes": processes,
                     "is_generator": self.is_generator,
@@ -528,6 +532,7 @@
             generated_cb = get_generated_callback(self.is_generator)
             try:
                 udf_results = udf.run(
+                    udf_fields,
                     udf_inputs,
                     self.catalog,
                     self.is_generator,
@@ -1244,21 +1249,23 @@ class DatasetQuery:
         actual_params = [normalize_param(p) for p in params]
         try:
             query = self.apply_steps().select()
+            query_fields = [str(c.name) for c in query.selected_columns]
 
-            def row_iter() -> Generator[
+            def row_iter() -> Generator[Sequence, None, None]:
                 # warehouse isn't threadsafe, we need to clone() it
                 # in the thread that uses the results
                 with self.catalog.warehouse.clone() as warehouse:
-                    gen = warehouse.dataset_select_paginated(
-                        query, limit=query._limit, order_by=query._order_by_clauses
-                    )
+                    gen = warehouse.dataset_select_paginated(query)
                     with contextlib.closing(gen) as rows:
                         yield from rows
 
-            async def get_params(row:
+            async def get_params(row: Sequence) -> tuple:
+                row_dict = RowDict(zip(query_fields, row))
                 return tuple(
                     [
-                        await p.get_value_async(
+                        await p.get_value_async(
+                            self.catalog, row_dict, mapper, **kwargs
+                        )
                         for p in actual_params
                     ]
                 )
datachain/query/dispatch.py
CHANGED
@@ -2,11 +2,8 @@ import contextlib
 from collections.abc import Iterator, Sequence
 from itertools import chain
 from multiprocessing import cpu_count
-from queue import Empty, Full, Queue
 from sys import stdin
-from
-from types import GeneratorType
-from typing import Any, Optional
+from typing import Optional
 
 import attrs
 import multiprocess
@@ -22,7 +19,16 @@ from datachain.query.dataset import (
     get_processed_callback,
     process_udf_outputs,
 )
+from datachain.query.queue import (
+    get_from_queue,
+    marshal,
+    msgpack_pack,
+    msgpack_unpack,
+    put_into_queue,
+    unmarshal,
+)
 from datachain.query.udf import UDFBase, UDFFactory, UDFResult
+from datachain.utils import batched_it
 
 DEFAULT_BATCH_SIZE = 10000
 STOP_SIGNAL = "STOP"
@@ -44,44 +50,6 @@ def get_n_workers_from_arg(n_workers: Optional[int] = None) -> int:
     return n_workers
 
 
-# For more context on the get_from_queue and put_into_queue functions, see the
-# discussion here:
-# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
-# This problem is not exactly described by, but is also related to these Python issues:
-# https://github.com/python/cpython/issues/66587
-# https://github.com/python/cpython/issues/88628
-# https://github.com/python/cpython/issues/108645
-
-
-def get_from_queue(queue: Queue) -> Any:
-    """
-    Gets an item from a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be available, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            return queue.get_nowait()
-        except Empty:
-            sleep(0.01)
-
-
-def put_into_queue(queue: Queue, item: Any) -> None:
-    """
-    Puts an item into a queue.
-    This is required to handle signals, such as KeyboardInterrupt exceptions
-    while waiting for items to be queued, although only on certain installations.
-    (See the above comment for more context.)
-    """
-    while True:
-        try:
-            queue.put_nowait(item)
-            return
-        except Full:
-            sleep(0.01)
-
-
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
     udf_info = load(stdin.buffer)
@@ -100,8 +68,9 @@ def udf_entrypoint() -> int:
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
-
+        udf_fields=udf_info["udf_fields"],
         cache=udf_info["cache"],
+        is_generator=udf_info.get("is_generator", False),
     )
 
     query = udf_info["query"]
@@ -121,7 +90,7 @@ def udf_entrypoint() -> int:
         generated_cb = get_generated_callback(dispatch.is_generator)
         try:
             udf_results = dispatch.run_udf_parallel(
-                udf_inputs,
+                marshal(udf_inputs),
                 n_workers=n_workers,
                 processed_cb=processed_cb,
                 download_cb=download_cb,
@@ -142,6 +111,9 @@ def udf_worker_entrypoint() -> int:
 
 
 class UDFDispatcher:
+    catalog: Optional[Catalog] = None
+    task_queue: Optional[multiprocess.Queue] = None
+    done_queue: Optional[multiprocess.Queue] = None
     _batch_size: Optional[int] = None
 
     def __init__(
@@ -151,9 +123,10 @@ class UDFDispatcher:
         id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
-
-
-
+        udf_fields: "Sequence[str]",
+        cache: bool,
+        is_generator: bool = False,
+        buffer_size: int = DEFAULT_BATCH_SIZE,
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
@@ -172,12 +145,13 @@ class UDFDispatcher:
             self.warehouse_args,
             self.warehouse_kwargs,
         ) = warehouse_clone_params
-        self.
+        self.udf_fields = udf_fields
         self.cache = cache
+        self.is_generator = is_generator
+        self.buffer_size = buffer_size
         self.catalog = None
         self.task_queue = None
         self.done_queue = None
-        self.buffer_size = buffer_size
         self.ctx = get_context("spawn")
 
     @property
@@ -226,6 +200,7 @@ class UDFDispatcher:
             self.done_queue,
             self.is_generator,
             self.cache,
+            self.udf_fields,
         )
 
     def _run_worker(self) -> None:
@@ -233,7 +208,11 @@ class UDFDispatcher:
             worker = self._create_worker()
             worker.run()
         except (Exception, KeyboardInterrupt) as e:
-
+            if self.done_queue:
+                put_into_queue(
+                    self.done_queue,
+                    {"status": FAILED_STATUS, "exception": e},
+                )
             raise
 
     @staticmethod
@@ -249,7 +228,6 @@
         self,
         input_rows,
         n_workers: Optional[int] = None,
-        cache: bool = False,
        input_queue=None,
        processed_cb: Callback = DEFAULT_CALLBACK,
        download_cb: Callback = DEFAULT_CALLBACK,
@@ -299,21 +277,24 @@
             result = get_from_queue(self.done_queue)
             status = result["status"]
             if status == NOTIFY_STATUS:
-
+                if downloaded := result.get("downloaded"):
+                    download_cb.relative_update(downloaded)
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
             elif status == FINISHED_STATUS:
                 # Worker finished
                 n_workers -= 1
             elif status == OK_STATUS:
-
-
+                if processed := result.get("processed"):
+                    processed_cb.relative_update(processed)
+                yield msgpack_unpack(result["result"])
             else:  # Failed / error
                 n_workers -= 1
-                exc
-                if exc:
+                if exc := result.get("exception"):
                     raise exc
                 raise RuntimeError("Internal error: Parallel UDF execution failed")
 
-            if not streaming_mode and not input_finished:
+            if status == OK_STATUS and not streaming_mode and not input_finished:
                 try:
                     put_into_queue(self.task_queue, next(input_data))
                 except StopIteration:
@@ -348,7 +329,7 @@
 
 
 class WorkerCallback(Callback):
-    def __init__(self, queue: multiprocess.Queue):
+    def __init__(self, queue: "multiprocess.Queue"):
         self.queue = queue
         super().__init__()
 
@@ -369,10 +350,11 @@ class ProcessedCallback(Callback):
 class UDFWorker:
     catalog: Catalog
     udf: UDFBase
-    task_queue: multiprocess.Queue
-    done_queue: multiprocess.Queue
+    task_queue: "multiprocess.Queue"
+    done_queue: "multiprocess.Queue"
     is_generator: bool
     cache: bool
+    udf_fields: Sequence[str]
     cb: Callback = attrs.field()
 
     @cb.default
@@ -382,7 +364,8 @@
     def run(self) -> None:
         processed_cb = ProcessedCallback()
         udf_results = self.udf.run(
-            self.
+            self.udf_fields,
+            unmarshal(self.get_inputs()),
             self.catalog,
             self.is_generator,
             self.cache,
@@ -390,15 +373,17 @@
             processed_cb=processed_cb,
         )
         for udf_output in udf_results:
-
-
+            for batch in batched_it(udf_output, DEFAULT_BATCH_SIZE):
+                put_into_queue(
+                    self.done_queue,
+                    {
+                        "status": OK_STATUS,
+                        "result": msgpack_pack(list(batch)),
+                    },
+                )
         put_into_queue(
             self.done_queue,
-            {
-                "status": OK_STATUS,
-                "result": udf_output,
-                "processed": processed_cb.processed_rows,
-            },
+            {"status": NOTIFY_STATUS, "processed": processed_cb.processed_rows},
         )
         put_into_queue(self.done_queue, {"status": FINISHED_STATUS})
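Taken together, the dispatcher changes above move the worker protocol to msgpack-packed result batches plus a separate progress notification. A rough sketch of the message shapes a UDFWorker now puts on done_queue, as implied by this diff (not a public API):

    from datachain.query.queue import msgpack_pack

    ok_msg = {"status": "OK", "result": msgpack_pack([{"sys__id": 1}])}   # one packed result batch
    notify_msg = {"status": "NOTIFY", "processed": 1000}                  # progress-only update
    finished_msg = {"status": "FINISHED"}                                 # worker is done
    failed_msg = {"status": "FAILED", "exception": RuntimeError("boom")}  # re-raised by the parent

On the parent side, run_udf_parallel unpacks OK results with msgpack_unpack and, per the new "status == OK_STATUS" guard, only refills the task queue after an OK message.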
datachain/query/queue.py
ADDED
@@ -0,0 +1,120 @@
+import datetime
+from collections.abc import Iterable, Iterator
+from queue import Empty, Full, Queue
+from struct import pack, unpack
+from time import sleep
+from typing import Any
+
+import msgpack
+
+from datachain.query.batch import RowsOutput, RowsOutputBatch
+
+DEFAULT_BATCH_SIZE = 10000
+STOP_SIGNAL = "STOP"
+OK_STATUS = "OK"
+FINISHED_STATUS = "FINISHED"
+FAILED_STATUS = "FAILED"
+NOTIFY_STATUS = "NOTIFY"
+
+
+# For more context on the get_from_queue and put_into_queue functions, see the
+# discussion here:
+# https://github.com/iterative/dvcx/pull/1297#issuecomment-2026308773
+# This problem is not exactly described by, but is also related to these Python issues:
+# https://github.com/python/cpython/issues/66587
+# https://github.com/python/cpython/issues/88628
+# https://github.com/python/cpython/issues/108645
+
+
+def get_from_queue(queue: Queue) -> Any:
+    """
+    Gets an item from a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be available, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            return queue.get_nowait()
+        except Empty:
+            sleep(0.01)
+
+
+def put_into_queue(queue: Queue, item: Any) -> None:
+    """
+    Puts an item into a queue.
+    This is required to handle signals, such as KeyboardInterrupt exceptions
+    while waiting for items to be queued, although only on certain installations.
+    (See the above comment for more context.)
+    """
+    while True:
+        try:
+            queue.put_nowait(item)
+            return
+        except Full:
+            sleep(0.01)
+
+
+MSGPACK_EXT_TYPE_DATETIME = 42
+MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH = 43
+
+
+def _msgpack_pack_extended_types(obj: Any) -> msgpack.ExtType:
+    if isinstance(obj, datetime.datetime):
+        # packing date object as 1 or 2 variables, depending if timezone info is present
+        # - timestamp
+        # - [OPTIONAL] timezone offset from utc in seconds if timezone info exists
+        if obj.tzinfo:
+            data = (obj.timestamp(), int(obj.utcoffset().total_seconds()))  # type: ignore # noqa: PGH003
+            return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!dl", *data))
+        data = (obj.timestamp(),)  # type: ignore # noqa: PGH003
+        return msgpack.ExtType(MSGPACK_EXT_TYPE_DATETIME, pack("!d", *data))
+
+    if isinstance(obj, RowsOutputBatch):
+        return msgpack.ExtType(
+            MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH,
+            msgpack_pack(obj.rows),
+        )
+
+    raise TypeError(f"Unknown type: {obj}")
+
+
+def msgpack_pack(obj: Any) -> bytes:
+    return msgpack.packb(obj, default=_msgpack_pack_extended_types)
+
+
+def _msgpack_unpack_extended_types(code: int, data: bytes) -> Any:
+    if code == MSGPACK_EXT_TYPE_DATETIME:
+        has_timezone = False
+        if len(data) == 8:
+            # we send only timestamp without timezone if data is 8 bytes
+            values = unpack("!d", data)
+        else:
+            has_timezone = True
+            values = unpack("!dl", data)
+
+        timestamp = values[0]
+        tz_info = None
+        if has_timezone:
+            timezone_offset = values[1]
+            tz_info = datetime.timezone(datetime.timedelta(seconds=timezone_offset))
+        return datetime.datetime.fromtimestamp(timestamp, tz=tz_info)
+
+    if code == MSGPACK_EXT_TYPE_ROWS_INPUT_BATCH:
+        return RowsOutputBatch(msgpack_unpack(data))
+
+    return msgpack.ExtType(code, data)
+
+
+def msgpack_unpack(data: bytes) -> Any:
+    return msgpack.unpackb(data, ext_hook=_msgpack_unpack_extended_types)
+
+
+def marshal(obj: Iterator[RowsOutput]) -> Iterable[bytes]:
+    for row in obj:
+        yield msgpack_pack(row)
+
+
+def unmarshal(obj: Iterator[bytes]) -> Iterable[RowsOutput]:
+    for row in obj:
+        yield msgpack_unpack(row)
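Because msgpack has no native aware-datetime type, the new module encodes datetimes as ExtType 42 (a timestamp plus an optional UTC offset in seconds) and row batches as ExtType 43. A small round-trip sketch using the helpers defined above (illustrative usage, not code from the package):

    from datetime import datetime, timedelta, timezone

    from datachain.query.queue import msgpack_pack, msgpack_unpack

    dt = datetime(2024, 7, 1, 12, 30, tzinfo=timezone(timedelta(hours=2)))
    packed = msgpack_pack({"created": dt})  # the datetime travels as ExtType 42
    restored = msgpack_unpack(packed)
    assert restored["created"] == dt        # the timezone offset survives the round trip
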
datachain/query/udf.py
CHANGED
@@ -15,7 +15,14 @@ from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 
 from datachain.dataset import RowDict
 
-from .batch import
+from .batch import (
+    Batch,
+    BatchingStrategy,
+    NoBatching,
+    Partition,
+    RowsOutputBatch,
+    UDFInputBatch,
+)
 from .schema import (
     UDFParameter,
     UDFParamSpec,
@@ -25,7 +32,7 @@ from .schema import (
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
 
-    from .batch import
+    from .batch import RowsOutput, UDFInput
 
 ColumnType = Any
 
@@ -107,7 +114,8 @@ class UDFBase:
 
     def run(
         self,
-
+        udf_fields: "Sequence[str]",
+        udf_inputs: "Iterable[RowsOutput]",
         catalog: "Catalog",
         is_generator: bool,
         cache: bool,
@@ -115,15 +123,22 @@
         processed_cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterator[Iterable["UDFResult"]]:
         for batch in udf_inputs:
-
-
+            if isinstance(batch, RowsOutputBatch):
+                n_rows = len(batch.rows)
+                inputs: UDFInput = UDFInputBatch(
+                    [RowDict(zip(udf_fields, row)) for row in batch.rows]
+                )
+            else:
+                n_rows = 1
+                inputs = RowDict(zip(udf_fields, batch))
+            output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
             processed_cb.relative_update(n_rows)
             yield output
 
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
@@ -199,12 +214,12 @@ class UDFWrapper(UDFBase):
     def run_once(
         self,
         catalog: "Catalog",
-        arg: "
+        arg: "UDFInput",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterable[UDFResult]:
-        if isinstance(arg,
+        if isinstance(arg, UDFInputBatch):
             udf_inputs = [
                 self.bind_parameters(catalog, row, cache=cache, cb=cb)
                 for row in arg.rows
datachain/utils.py
CHANGED
@@ -10,7 +10,7 @@ import sys
 import time
 from collections.abc import Iterable, Iterator, Sequence
 from datetime import date, datetime, timezone
-from itertools import islice
+from itertools import chain, islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
@@ -241,7 +241,7 @@ _T_co = TypeVar("_T_co", covariant=True)
 
 
 def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
-    "Batch data into tuples of length n. The last batch may be shorter."
+    """Batch data into tuples of length n. The last batch may be shorter."""
     # Based on: https://docs.python.org/3/library/itertools.html#itertools-recipes
     # batched('ABCDEFG', 3) --> ABC DEF G
     if n < 1:
@@ -251,6 +251,21 @@ def batched(iterable: Iterable[_T_co], n: int) -> Iterator[tuple[_T_co, ...]]:
         yield batch
 
 
+def batched_it(iterable: Iterable[_T_co], n: int) -> Iterator[Iterator[_T_co]]:
+    """Batch data into iterators of length n. The last batch may be shorter."""
+    # batched('ABCDEFG', 3) --> ABC DEF G
+    if n < 1:
+        raise ValueError("Batch size must be at least one")
+    it = iter(iterable)
+    while True:
+        chunk_it = islice(it, n)
+        try:
+            first_el = next(chunk_it)
+        except StopIteration:
+            return
+        yield chain((first_el,), chunk_it)
+
+
 def flatten(items):
     for item in items:
         if isinstance(item, list):
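Unlike the existing batched(), the new batched_it() yields each batch as a lazy iterator rather than a materialized tuple, which is what lets UDFWorker.run stream large UDF outputs to the result queue chunk by chunk. A short usage sketch (assumes datachain 0.3.1 is installed):

    from datachain.utils import batched_it

    for batch in batched_it(range(7), 3):
        print(list(batch))  # [0, 1, 2], then [3, 4, 5], then [6]
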
{datachain-0.3.0.dist-info → datachain-0.3.1.dist-info}/RECORD
CHANGED
@@ -15,18 +15,18 @@ datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=9-7SnMjh5ruH9sdKDo8P5EklX9oC2EHH6bnku6ZqLko,80275
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=3RfDTAI_TszDy9WazHQd3bI3sS2wDFrNXfNqCDewZgE,2214
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=G4QTm3KPhlaV74T3gLXJ86345_ak8CH38ezn2ET-oLc,13230
 datachain/client/gcs.py,sha256=Mt77W_l8_fK61gLm4mmxNmENuOM0ETwxdiFp4S8d-_w,4105
-datachain/client/local.py,sha256=
+datachain/client/local.py,sha256=SyGnqcrbtSvDK6IJsQa6NxxHwbWaWIP1GLZsQBXg_IA,4939
 datachain/client/s3.py,sha256=GfRZZzNPQPRsYjoef8bbsLbanJPUlCbyGTTK8ojzp8A,6136
 datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
@@ -36,13 +36,13 @@ datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz
 datachain/data_storage/schema.py,sha256=Idi-29fckvZozzvkyz3nTR2FOIajPlSuPdIEO7SMvXM,7863
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=e24ecfIcypVkmVBqvr-p06zpwrw7GD20gy1gBJQPT-I,58012
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
@@ -51,7 +51,7 @@ datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=VL9TR0CJ3eRzjIDr-8e-e7cZKuMBbPUZtY2lGAsucc0,15734
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=n3x6No-7l5LAciPJPWwZbA8WtTnGUU7d0wRL6CyfZh8,11847
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,15 +64,16 @@ datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxI
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
-datachain/query/batch.py,sha256
+datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=sRKY2it_znlzTNOt_OCRe008rHu0TXMnFwvGsnthSO0,60209
+datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
+datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=O3mTM5DRjvRAJCI7O9mR8wOdFJbgI1jIjvtfl5YvjI4,7755
 datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
-datachain/query/udf.py,sha256=
+datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -92,9 +93,9 @@ datachain/sql/sqlite/base.py,sha256=LBYmXqXsVF30fbcnR55evCZHbPDCzMdGk_ogPLps63s,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
-datachain-0.3.
+datachain-0.3.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.1.dist-info/METADATA,sha256=qR3OMpGUkx0cKelnl51d9uksn5H-Wn4LvTJbUnTMDuQ,17268
+datachain-0.3.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.3.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|