datachain-0.4.0-py3-none-any.whl → datachain-0.5.1-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
4
4
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
5
- datachain/cli.py,sha256=TQ1OKMulAcsJndKLCyxJpfNqbMWQgOa4Aeihnu36cR8,30095
5
+ datachain/cli.py,sha256=vVK7hNEyF7p5bUTmixkbgS7JYyTSpXeyRZJkWfpYUOw,30164
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
- datachain/dataset.py,sha256=HWcFckJpmTU5AGsg8ILW8JInpNQqaWmJoasls18q5kI,14735
8
+ datachain/dataset.py,sha256=w7qqJP7xYrm9CmBSmSezSxUQHZDsHKkwviF8AYUob7o,14671
9
9
  datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
10
10
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
11
11
  datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
18
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
19
  datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
20
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
21
- datachain/catalog/catalog.py,sha256=FuKuIiCwPgN5Ea25hnFe_ZFZH9YEUZ2ma9k_Lczk-JU,63867
21
+ datachain/catalog/catalog.py,sha256=BsMyk2RQibQYHgrmovFZeSEpPVMTwgb_7ntVYdc7t-E,64090
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
24
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -33,31 +33,31 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
33
33
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
34
34
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
35
35
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
36
- datachain/data_storage/metastore.py,sha256=Ztw86JbN4-1gobZea1oqAAT2kotvi46pxNRjqncZ7B8,52457
36
+ datachain/data_storage/metastore.py,sha256=HfCxk4lmDUg2Q4WsFNQGMWxllP0mToA00fxkFTwdNIE,52919
37
37
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
38
38
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
39
- datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
40
- datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
39
+ datachain/data_storage/sqlite.py,sha256=fW08P7AbJ0cDbTbcTKuAGpvMXvBjg-QkGsKT_Dslyws,28383
40
+ datachain/data_storage/warehouse.py,sha256=fXhVfao3NfWFGbbG5uJ-Ga4bX1FiKVfcbDyQgECYfk8,32122
41
41
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- datachain/lib/arrow.py,sha256=aUsoQmxDmuSnB8Ik9p57Y66gc_dgx6NBqkDDIfLsvno,7630
42
+ datachain/lib/arrow.py,sha256=0R2CYsN82nNa5_03iS6jVix9EKeeqNZNAMgpSQP2hfo,9482
43
43
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
44
44
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
45
45
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
46
- datachain/lib/dc.py,sha256=kabEHnqbcoat7gd-yl0PvmuC6SyKbRa8r7NWKcN6GEQ,68978
46
+ datachain/lib/dc.py,sha256=HLOAkJEKFHJV_PqwSu0Pyl1m7JmUea8_wiMJFr14Nfk,75960
47
47
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
48
48
  datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
49
49
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
50
- datachain/lib/listing.py,sha256=cHPN5-Fq8yb0gP6DARImhmZWxykDDNqhhJujDxEp53A,4104
50
+ datachain/lib/listing.py,sha256=_2oQXh03RAOydeyW3G4OSXCncZaapMGlyGCYcvuUPhc,4145
51
51
  datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
52
52
  datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
53
53
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
54
- datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
54
+ datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
55
55
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
56
56
  datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
57
57
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
58
58
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
59
- datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
60
- datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
59
+ datachain/lib/udf.py,sha256=oHhJWb0gVTxcybGzYDzAeN0Gb1IMhZBoGefncT88dIY,12339
60
+ datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
61
61
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
62
62
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
@@ -70,14 +70,13 @@ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xd
70
70
  datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
71
71
  datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
72
72
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
73
- datachain/query/dataset.py,sha256=F9WEVhDuFm6NQT6l-Vi3PMU-mQVpqwKHMgZIA4eWB18,53602
74
- datachain/query/dispatch.py,sha256=CFAc09O6UllcyUSSEY1GUlEMPzeO8RYhXinNN4HBl9M,12405
73
+ datachain/query/dataset.py,sha256=1c7y178ccFSeL_WIba0vT87Md_Oo4F8zaTVDjB9Bp3I,53641
74
+ datachain/query/dispatch.py,sha256=JVcZ4REE_GOsqXbar_Cb_fk-pHgQoabQLzXwuu7IhOg,12409
75
75
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
76
76
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
77
77
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
78
78
  datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
79
- datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
80
- datachain/query/udf.py,sha256=HB2hbEuiGA4ch9P2mh9iLA5Jj9mRj-4JFy9VfjTLJ8U,3622
79
+ datachain/query/session.py,sha256=kpFFJMfWBnxaMPojMGhJRbk-BOsSYI8Ckl6vvqnx7d0,5787
81
80
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
81
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
83
82
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
@@ -97,9 +96,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
97
96
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
98
97
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
99
98
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
100
- datachain-0.4.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
- datachain-0.4.0.dist-info/METADATA,sha256=UmW4n6_qqsTZe_bXdjwCe6n6zWSVq35Kn_-h_u_b0RA,17156
102
- datachain-0.4.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
- datachain-0.4.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
- datachain-0.4.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
- datachain-0.4.0.dist-info/RECORD,,
99
+ datachain-0.5.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
100
+ datachain-0.5.1.dist-info/METADATA,sha256=n8TFKjDmTzNBMaW5Oa6MUUUOAQbAjPzkAMaKCW3Y9NU,17156
101
+ datachain-0.5.1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
102
+ datachain-0.5.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
103
+ datachain-0.5.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
104
+ datachain-0.5.1.dist-info/RECORD,,
datachain/query/udf.py DELETED
@@ -1,126 +0,0 @@
1
- import typing
2
- from collections.abc import Iterable, Iterator, Sequence
3
- from dataclasses import dataclass
4
- from typing import (
5
- TYPE_CHECKING,
6
- Any,
7
- )
8
-
9
- from fsspec.callbacks import DEFAULT_CALLBACK, Callback
10
-
11
- from datachain.dataset import RowDict
12
-
13
- from .batch import (
14
- Batch,
15
- BatchingStrategy,
16
- NoBatching,
17
- Partition,
18
- RowsOutputBatch,
19
- UDFInputBatch,
20
- )
21
- from .schema import UDFParameter
22
-
23
- if TYPE_CHECKING:
24
- from datachain.catalog import Catalog
25
-
26
- from .batch import RowsOutput, UDFInput
27
-
28
- ColumnType = Any
29
-
30
-
31
- # Specification for the output of a UDF
32
- UDFOutputSpec = typing.Mapping[str, ColumnType]
33
-
34
- # Result type when calling the UDF wrapper around the actual
35
- # Python function / class implementing it.
36
- UDFResult = dict[str, Any]
37
-
38
-
39
- @dataclass
40
- class UDFProperties:
41
- """Container for basic UDF properties."""
42
-
43
- params: list[UDFParameter]
44
- output: UDFOutputSpec
45
- batch: int = 1
46
-
47
- def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
48
- if use_partitioning:
49
- return Partition()
50
- if self.batch == 1:
51
- return NoBatching()
52
- if self.batch > 1:
53
- return Batch(self.batch)
54
- raise ValueError(f"invalid batch size {self.batch}")
55
-
56
- def signal_names(self) -> Iterable[str]:
57
- return self.output.keys()
58
-
59
-
60
- class UDFBase:
61
- """A base class for implementing stateful UDFs."""
62
-
63
- def __init__(
64
- self,
65
- properties: UDFProperties,
66
- ):
67
- self.properties = properties
68
- self.signal_names = properties.signal_names()
69
- self.output = properties.output
70
-
71
- def run(
72
- self,
73
- udf_fields: "Sequence[str]",
74
- udf_inputs: "Iterable[RowsOutput]",
75
- catalog: "Catalog",
76
- is_generator: bool,
77
- cache: bool,
78
- download_cb: Callback = DEFAULT_CALLBACK,
79
- processed_cb: Callback = DEFAULT_CALLBACK,
80
- ) -> Iterator[Iterable["UDFResult"]]:
81
- for batch in udf_inputs:
82
- if isinstance(batch, RowsOutputBatch):
83
- n_rows = len(batch.rows)
84
- inputs: UDFInput = UDFInputBatch(
85
- [RowDict(zip(udf_fields, row)) for row in batch.rows]
86
- )
87
- else:
88
- n_rows = 1
89
- inputs = RowDict(zip(udf_fields, batch))
90
- output = self.run_once(catalog, inputs, is_generator, cache, cb=download_cb)
91
- processed_cb.relative_update(n_rows)
92
- yield output
93
-
94
- def run_once(
95
- self,
96
- catalog: "Catalog",
97
- arg: "UDFInput",
98
- is_generator: bool = False,
99
- cache: bool = False,
100
- cb: Callback = DEFAULT_CALLBACK,
101
- ) -> Iterable[UDFResult]:
102
- raise NotImplementedError
103
-
104
- def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
105
- return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
106
-
107
- def _process_results(
108
- self,
109
- rows: Sequence["RowDict"],
110
- results: Sequence[Sequence[Any]],
111
- is_generator=False,
112
- ) -> Iterable[UDFResult]:
113
- """Create a list of dictionaries representing UDF results."""
114
-
115
- # outputting rows
116
- if is_generator:
117
- # each row in results is a tuple of column values
118
- return (dict(zip(self.signal_names, row)) for row in results)
119
-
120
- # outputting signals
121
- row_ids = [row["sys__id"] for row in rows]
122
- return [
123
- {"sys__id": row_id} | dict(zip(self.signal_names, signals))
124
- for row_id, signals in zip(row_ids, results)
125
- if signals is not None # skip rows with no output
126
- ]