datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +53 -41
- datachain/cli.py +25 -3
- datachain/client/__init__.py +1 -2
- datachain/data_storage/sqlite.py +20 -6
- datachain/lib/dc.py +160 -110
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +2 -1
- datachain/lib/meta_formats.py +40 -43
- datachain/lib/pytorch.py +1 -5
- datachain/lib/signal_schema.py +28 -6
- datachain/query/dataset.py +5 -1
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/toolkit/split.py +19 -6
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/METADATA +10 -10
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/RECORD +20 -19
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/LICENSE +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/WHEEL +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/top_level.txt +0 -0
datachain/toolkit/split.py
CHANGED
|
@@ -1,7 +1,16 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
1
4
|
from datachain import C, DataChain
|
|
2
5
|
|
|
6
|
+
RESOLUTION = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
|
|
7
|
+
|
|
3
8
|
|
|
4
|
-
def train_test_split(
|
|
9
|
+
def train_test_split(
|
|
10
|
+
dc: DataChain,
|
|
11
|
+
weights: list[float],
|
|
12
|
+
seed: Optional[int] = None,
|
|
13
|
+
) -> list[DataChain]:
|
|
5
14
|
"""
|
|
6
15
|
Splits a DataChain into multiple subsets based on the provided weights.
|
|
7
16
|
|
|
@@ -18,6 +27,8 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
|
|
|
18
27
|
For example:
|
|
19
28
|
- `[0.7, 0.3]` corresponds to a 70/30 split;
|
|
20
29
|
- `[2, 1, 1]` corresponds to a 50/25/25 split.
|
|
30
|
+
seed (int, optional):
|
|
31
|
+
The seed for the random number generator. Defaults to None.
|
|
21
32
|
|
|
22
33
|
Returns:
|
|
23
34
|
list[DataChain]:
|
|
@@ -58,14 +69,16 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
|
|
|
58
69
|
|
|
59
70
|
weights_normalized = [weight / sum(weights) for weight in weights]
|
|
60
71
|
|
|
61
|
-
|
|
72
|
+
rand_col = C("sys.rand")
|
|
73
|
+
if seed is not None:
|
|
74
|
+
uniform_seed = random.Random(seed).randrange(1, RESOLUTION) # noqa: S311
|
|
75
|
+
rand_col = (rand_col % RESOLUTION) * uniform_seed # type: ignore[assignment]
|
|
76
|
+
rand_col = rand_col % RESOLUTION # type: ignore[assignment]
|
|
62
77
|
|
|
63
78
|
return [
|
|
64
79
|
dc.filter(
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
C("sys__rand") % resolution
|
|
68
|
-
< round(sum(weights_normalized[: index + 1]) * resolution),
|
|
80
|
+
rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
|
|
81
|
+
rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
|
|
69
82
|
)
|
|
70
83
|
for index, _ in enumerate(weights_normalized)
|
|
71
84
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.7.10
|
|
3
|
+
Version: 0.8.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
|
|
|
46
46
|
Requires-Dist: platformdirs
|
|
47
47
|
Requires-Dist: dvc-studio-client<1,>=0.21
|
|
48
48
|
Requires-Dist: tabulate
|
|
49
|
+
Requires-Dist: websockets
|
|
49
50
|
Provides-Extra: docs
|
|
50
51
|
Requires-Dist: mkdocs>=1.5.2; extra == "docs"
|
|
51
52
|
Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
|
|
@@ -91,14 +92,14 @@ Requires-Dist: types-requests; extra == "dev"
|
|
|
91
92
|
Requires-Dist: types-tabulate; extra == "dev"
|
|
92
93
|
Provides-Extra: examples
|
|
93
94
|
Requires-Dist: datachain[tests]; extra == "examples"
|
|
94
|
-
Requires-Dist: numpy<2,>=1; extra == "examples"
|
|
95
95
|
Requires-Dist: defusedxml; extra == "examples"
|
|
96
96
|
Requires-Dist: accelerate; extra == "examples"
|
|
97
|
-
Requires-Dist:
|
|
97
|
+
Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
|
|
98
|
+
Requires-Dist: unstructured[pdf]; extra == "examples"
|
|
98
99
|
Requires-Dist: pdfplumber==0.11.4; extra == "examples"
|
|
99
100
|
Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
|
|
100
101
|
Requires-Dist: onnx==1.16.1; extra == "examples"
|
|
101
|
-
Requires-Dist: ultralytics==8.3.
|
|
102
|
+
Requires-Dist: ultralytics==8.3.50; extra == "examples"
|
|
102
103
|
|
|
103
104
|
================
|
|
104
105
|
|logo| DataChain
|
|
@@ -138,6 +139,11 @@ Use Cases
|
|
|
138
139
|
3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
|
|
139
140
|
Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
|
|
140
141
|
|
|
142
|
+
Getting Started
|
|
143
|
+
===============
|
|
144
|
+
|
|
145
|
+
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
|
|
146
|
+
to get started with `DataChain` and learn more.
|
|
141
147
|
|
|
142
148
|
Key Features
|
|
143
149
|
============
|
|
@@ -161,12 +167,6 @@ Key Features
|
|
|
161
167
|
- Pass datasets to Pytorch and Tensorflow, or export them back into storage.
|
|
162
168
|
|
|
163
169
|
|
|
164
|
-
Getting Started
|
|
165
|
-
===============
|
|
166
|
-
|
|
167
|
-
Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
|
|
168
|
-
|
|
169
|
-
|
|
170
170
|
Contributing
|
|
171
171
|
============
|
|
172
172
|
|
|
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
|
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
|
|
4
4
|
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
|
|
5
|
-
datachain/cli.py,sha256=
|
|
5
|
+
datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
|
|
8
8
|
datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
|
|
@@ -14,14 +14,14 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
|
|
|
14
14
|
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
|
|
15
15
|
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
|
|
16
16
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
datachain/studio.py,sha256=
|
|
17
|
+
datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
|
|
18
18
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
19
|
datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
|
|
20
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
21
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=nuWjSIs4MO1hJa8-LQGbiMXLWWznPB_VKSVpS7368t4,58415
|
|
22
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
23
23
|
datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
|
|
24
|
-
datachain/client/__init__.py,sha256=
|
|
24
|
+
datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
|
|
25
25
|
datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
|
|
26
26
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
27
27
|
datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
|
|
@@ -35,7 +35,7 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
|
|
|
35
35
|
datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
|
|
36
36
|
datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
|
|
37
37
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
38
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
38
|
+
datachain/data_storage/sqlite.py,sha256=iJv1QxwVifOowtYhIDqYVoea21dvkQIdxklGNIend3c,22961
|
|
39
39
|
datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
|
|
40
40
|
datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
|
|
41
41
|
datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
|
|
@@ -53,17 +53,18 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
|
|
|
53
53
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
54
54
|
datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
|
|
55
55
|
datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
|
|
56
|
-
datachain/lib/dc.py,sha256=
|
|
57
|
-
datachain/lib/
|
|
56
|
+
datachain/lib/dc.py,sha256=7Wm6TEPVNCSh4bz0iA9JvEsYtYAZ9o97lK7TEJ8modE,92149
|
|
57
|
+
datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
|
|
58
|
+
datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
|
|
58
59
|
datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
|
|
59
60
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
60
61
|
datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
|
|
61
62
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
62
|
-
datachain/lib/meta_formats.py,sha256=
|
|
63
|
+
datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
|
|
63
64
|
datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
|
|
64
|
-
datachain/lib/pytorch.py,sha256=
|
|
65
|
+
datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
|
|
65
66
|
datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
|
|
66
|
-
datachain/lib/signal_schema.py,sha256=
|
|
67
|
+
datachain/lib/signal_schema.py,sha256=ziRTctom0-wAqURZfkfG6dc_3P2FcYxKjYsKC49NQ1Q,25415
|
|
67
68
|
datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
|
|
68
69
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
69
70
|
datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
|
|
@@ -88,7 +89,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
|
|
|
88
89
|
datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
|
|
89
90
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
90
91
|
datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
|
|
91
|
-
datachain/query/dataset.py,sha256=
|
|
92
|
+
datachain/query/dataset.py,sha256=fECGctERQrfLIowN9Fo6dTSnmHEe9WbfcjHRtRObcio,54667
|
|
92
93
|
datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
|
|
93
94
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
94
95
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -96,7 +97,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
|
96
97
|
datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
|
|
97
98
|
datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
|
|
98
99
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
99
|
-
datachain/remote/studio.py,sha256=
|
|
100
|
+
datachain/remote/studio.py,sha256=3DlgESETzxm3dgb6zzjjGxsddSkacT2dARnteLAfMxQ,13366
|
|
100
101
|
datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
|
|
101
102
|
datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
|
|
102
103
|
datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
|
|
@@ -116,11 +117,11 @@ datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,
|
|
|
116
117
|
datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
|
|
117
118
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
118
119
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
119
|
-
datachain/toolkit/split.py,sha256=
|
|
120
|
+
datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
|
|
120
121
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
121
|
-
datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
122
|
-
datachain-0.
|
|
123
|
-
datachain-0.
|
|
124
|
-
datachain-0.
|
|
125
|
-
datachain-0.
|
|
126
|
-
datachain-0.
|
|
122
|
+
datachain-0.8.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
123
|
+
datachain-0.8.0.dist-info/METADATA,sha256=PXb2pYY67bdfDjFXR7C9hwN6LaKSmseRZJNFakrWfyg,8437
|
|
124
|
+
datachain-0.8.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
125
|
+
datachain-0.8.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
126
|
+
datachain-0.8.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
127
|
+
datachain-0.8.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|