datachain-0.4.0-py3-none-any.whl → datachain-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +8 -0
- datachain/cli.py +3 -2
- datachain/data_storage/metastore.py +28 -9
- datachain/data_storage/sqlite.py +24 -32
- datachain/data_storage/warehouse.py +1 -3
- datachain/dataset.py +0 -3
- datachain/lib/arrow.py +64 -19
- datachain/lib/dc.py +310 -123
- datachain/lib/listing.py +5 -3
- datachain/lib/pytorch.py +5 -1
- datachain/lib/udf.py +100 -78
- datachain/lib/udf_signature.py +8 -6
- datachain/query/dataset.py +7 -7
- datachain/query/dispatch.py +2 -2
- datachain/query/session.py +42 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/METADATA +1 -1
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/RECORD +21 -22
- datachain/query/udf.py +0 -126
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/LICENSE +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/WHEEL +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.4.0.dist-info → datachain-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
|
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
|
|
4
4
|
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
|
|
5
|
-
datachain/cli.py,sha256=
|
|
5
|
+
datachain/cli.py,sha256=vVK7hNEyF7p5bUTmixkbgS7JYyTSpXeyRZJkWfpYUOw,30164
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
|
-
datachain/dataset.py,sha256=
|
|
8
|
+
datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
|
|
9
9
|
datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
|
|
10
10
|
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
|
|
11
11
|
datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
|
|
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
|
18
18
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
19
|
datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
|
|
20
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
21
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=BsMyk2RQibQYHgrmovFZeSEpPVMTwgb_7ntVYdc7t-E,64090
|
|
22
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
23
23
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
24
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
@@ -33,31 +33,31 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
|
|
|
33
33
|
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
|
|
34
34
|
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
|
|
35
35
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
36
|
-
datachain/data_storage/metastore.py,sha256=
|
|
36
|
+
datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
|
|
37
37
|
datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
|
|
38
38
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
39
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
40
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
+
datachain/data_storage/sqlite.py,sha256=fW08P7AbJ0cDbTbcTKuAGpvMXvBjg-QkGsKT_Dslyws,28383
|
|
40
|
+
datachain/data_storage/warehouse.py,sha256=fXhVfao3NfWFGbbG5uJ-Ga4bX1FiKVfcbDyQgECYfk8,32122
|
|
41
41
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
+
datachain/lib/arrow.py,sha256=0R2CYsN82nNa5_03iS6jVix9EKeeqNZNAMgpSQP2hfo,9482
|
|
43
43
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
44
44
|
datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
|
|
45
45
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
46
|
-
datachain/lib/dc.py,sha256=
|
|
46
|
+
datachain/lib/dc.py,sha256=HLOAkJEKFHJV_PqwSu0Pyl1m7JmUea8_wiMJFr14Nfk,75960
|
|
47
47
|
datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
|
|
48
48
|
datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
|
|
49
49
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
50
|
-
datachain/lib/listing.py,sha256=
|
|
50
|
+
datachain/lib/listing.py,sha256=_2oQXh03RAOydeyW3G4OSXCncZaapMGlyGCYcvuUPhc,4145
|
|
51
51
|
datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
|
|
52
52
|
datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
|
|
53
53
|
datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
|
|
54
|
-
datachain/lib/pytorch.py,sha256=
|
|
54
|
+
datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
|
|
55
55
|
datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
|
|
56
56
|
datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
|
|
57
57
|
datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
|
|
58
58
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
59
|
-
datachain/lib/udf.py,sha256=
|
|
60
|
-
datachain/lib/udf_signature.py,sha256=
|
|
59
|
+
datachain/lib/udf.py,sha256=oHhJWb0gVTxcybGzYDzAeN0Gb1IMhZBoGefncT88dIY,12339
|
|
60
|
+
datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
|
|
61
61
|
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
62
62
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
|
|
@@ -70,14 +70,13 @@ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xd
|
|
|
70
70
|
datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
|
|
71
71
|
datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
|
|
72
72
|
datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
|
|
73
|
-
datachain/query/dataset.py,sha256=
|
|
74
|
-
datachain/query/dispatch.py,sha256=
|
|
73
|
+
datachain/query/dataset.py,sha256=1c7y178ccFSeL_WIba0vT87Md_Oo4F8zaTVDjB9Bp3I,53641
|
|
74
|
+
datachain/query/dispatch.py,sha256=JVcZ4REE_GOsqXbar_Cb_fk-pHgQoabQLzXwuu7IhOg,12409
|
|
75
75
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
76
76
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
77
77
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
78
78
|
datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
|
|
79
|
-
datachain/query/session.py,sha256=
|
|
80
|
-
datachain/query/udf.py,sha256=HB2hbEuiGA4ch9P2mh9iLA5Jj9mRj-4JFy9VfjTLJ8U,3622
|
|
79
|
+
datachain/query/session.py,sha256=kpFFJMfWBnxaMPojMGhJRbk-BOsSYI8Ckl6vvqnx7d0,5787
|
|
81
80
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
81
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
83
82
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
97
96
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
98
97
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
99
98
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
100
|
-
datachain-0.4.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
-
datachain-0.4.0.dist-info/METADATA,sha256=
|
|
102
|
-
datachain-0.4.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
103
|
-
datachain-0.4.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
-
datachain-0.4.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
-
datachain-0.4.0.dist-info/RECORD,,
|
|
99
|
+
datachain-0.5.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
100
|
+
datachain-0.5.1.dist-info/METADATA,sha256=n8TFKjDmTzNBMaW5Oa6MUUUOAQbAjPzkAMaKCW3Y9NU,17156
|
|
101
|
+
datachain-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
102
|
+
datachain-0.5.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
103
|
+
datachain-0.5.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
104
|
+
datachain-0.5.1.dist-info/RECORD,,
|
datachain/query/udf.py
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
import typing
|
|
2
|
-
from collections.abc import Iterable, Iterator, Sequence
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from typing import (
|
|
5
|
-
TYPE_CHECKING,
|
|
6
|
-
Any,
|
|
7
|
-
)
|
|
8
|
-
|
|
9
|
-
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
10
|
-
|
|
11
|
-
from datachain.dataset import RowDict
|
|
12
|
-
|
|
13
|
-
from .batch import (
|
|
14
|
-
Batch,
|
|
15
|
-
BatchingStrategy,
|
|
16
|
-
NoBatching,
|
|
17
|
-
Partition,
|
|
18
|
-
RowsOutputBatch,
|
|
19
|
-
UDFInputBatch,
|
|
20
|
-
)
|
|
21
|
-
from .schema import UDFParameter
|
|
22
|
-
|
|
23
|
-
if TYPE_CHECKING:
|
|
24
|
-
from datachain.catalog import Catalog
|
|
25
|
-
|
|
26
|
-
from .batch import RowsOutput, UDFInput
|
|
27
|
-
|
|
28
|
-
ColumnType = Any
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# Specification for the output of a UDF
|
|
32
|
-
UDFOutputSpec = typing.Mapping[str, ColumnType]
|
|
33
|
-
|
|
34
|
-
# Result type when calling the UDF wrapper around the actual
|
|
35
|
-
# Python function / class implementing it.
|
|
36
|
-
UDFResult = dict[str, Any]
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
@dataclass
|
|
40
|
-
class UDFProperties:
|
|
41
|
-
"""Container for basic UDF properties."""
|
|
42
|
-
|
|
43
|
-
params: list[UDFParameter]
|
|
44
|
-
output: UDFOutputSpec
|
|
45
|
-
batch: int = 1
|
|
46
|
-
|
|
47
|
-
def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
|
|
48
|
-
if use_partitioning:
|
|
49
|
-
return Partition()
|
|
50
|
-
if self.batch == 1:
|
|
51
|
-
return NoBatching()
|
|
52
|
-
if self.batch > 1:
|
|
53
|
-
return Batch(self.batch)
|
|
54
|
-
raise ValueError(f"invalid batch size {self.batch}")
|
|
55
|
-
|
|
56
|
-
def signal_names(self) -> Iterable[str]:
|
|
57
|
-
return self.output.keys()
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class UDFBase:
|
|
61
|
-
"""A base class for implementing stateful UDFs."""
|
|
62
|
-
|
|
63
|
-
def __init__(
|
|
64
|
-
self,
|
|
65
|
-
properties: UDFProperties,
|
|
66
|
-
):
|
|
67
|
-
self.properties = properties
|
|
68
|
-
self.signal_names = properties.signal_names()
|
|
69
|
-
self.output = properties.output
|
|
70
|
-
|
|
71
|
-
def run(
|
|
72
|
-
self,
|
|
73
|
-
udf_fields: "Sequence[str]",
|
|
74
|
-
udf_inputs: "Iterable[RowsOutput]",
|
|
75
|
-
catalog: "Catalog",
|
|
76
|
-
is_generator: bool,
|
|
77
|
-
cache: bool,
|
|
78
|
-
download_cb: Callback = DEFAULT_CALLBACK,
|
|
79
|
-
processed_cb: Callback = DEFAULT_CALLBACK,
|
|
80
|
-
) -> Iterator[Iterable["UDFResult"]]:
|
|
81
|
-
for batch in udf_inputs:
|
|
82
|
-
if isinstance(batch, RowsOutputBatch):
|
|
83
|
-
n_rows = len(batch.rows)
|
|
84
|
-
inputs: UDFInput = UDFInputBatch(
|
|
85
|
-
[RowDict(zip(udf_fields, row)) for row in batch.rows]
|
|
86
|
-
)
|
|
87
|
-
else:
|
|
88
|
-
n_rows = 1
|
|
89
|
-
inputs = RowDict(zip(udf_fields, batch))
|
|
90
|
-
output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
|
|
91
|
-
processed_cb.relative_update(n_rows)
|
|
92
|
-
yield output
|
|
93
|
-
|
|
94
|
-
def run_once(
|
|
95
|
-
self,
|
|
96
|
-
catalog: "Catalog",
|
|
97
|
-
arg: "UDFInput",
|
|
98
|
-
is_generator: bool = False,
|
|
99
|
-
cache: bool = False,
|
|
100
|
-
cb: Callback = DEFAULT_CALLBACK,
|
|
101
|
-
) -> Iterable[UDFResult]:
|
|
102
|
-
raise NotImplementedError
|
|
103
|
-
|
|
104
|
-
def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
|
|
105
|
-
return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
|
|
106
|
-
|
|
107
|
-
def _process_results(
|
|
108
|
-
self,
|
|
109
|
-
rows: Sequence["RowDict"],
|
|
110
|
-
results: Sequence[Sequence[Any]],
|
|
111
|
-
is_generator=False,
|
|
112
|
-
) -> Iterable[UDFResult]:
|
|
113
|
-
"""Create a list of dictionaries representing UDF results."""
|
|
114
|
-
|
|
115
|
-
# outputting rows
|
|
116
|
-
if is_generator:
|
|
117
|
-
# each row in results is a tuple of column values
|
|
118
|
-
return (dict(zip(self.signal_names, row)) for row in results)
|
|
119
|
-
|
|
120
|
-
# outputting signals
|
|
121
|
-
row_ids = [row["sys__id"] for row in rows]
|
|
122
|
-
return [
|
|
123
|
-
{"sys__id": row_id} | dict(zip(self.signal_names, signals))
|
|
124
|
-
for row_id, signals in zip(row_ids, results)
|
|
125
|
-
if signals is not None # skip rows with no output
|
|
126
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|