datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -1,7 +1,16 @@
1
+ import random
2
+ from typing import Optional
3
+
1
4
  from datachain import C, DataChain
2
5
 
6
+ RESOLUTION = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
7
+
3
8
 
4
- def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
9
+ def train_test_split(
10
+ dc: DataChain,
11
+ weights: list[float],
12
+ seed: Optional[int] = None,
13
+ ) -> list[DataChain]:
5
14
  """
6
15
  Splits a DataChain into multiple subsets based on the provided weights.
7
16
 
@@ -18,6 +27,8 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
18
27
  For example:
19
28
  - `[0.7, 0.3]` corresponds to a 70/30 split;
20
29
  - `[2, 1, 1]` corresponds to a 50/25/25 split.
30
+ seed (int, optional):
31
+ The seed for the random number generator. Defaults to None.
21
32
 
22
33
  Returns:
23
34
  list[DataChain]:
@@ -58,14 +69,16 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
58
69
 
59
70
  weights_normalized = [weight / sum(weights) for weight in weights]
60
71
 
61
- resolution = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
72
+ rand_col = C("sys.rand")
73
+ if seed is not None:
74
+ uniform_seed = random.Random(seed).randrange(1, RESOLUTION) # noqa: S311
75
+ rand_col = (rand_col % RESOLUTION) * uniform_seed # type: ignore[assignment]
76
+ rand_col = rand_col % RESOLUTION # type: ignore[assignment]
62
77
 
63
78
  return [
64
79
  dc.filter(
65
- C("sys__rand") % resolution
66
- >= round(sum(weights_normalized[:index]) * resolution),
67
- C("sys__rand") % resolution
68
- < round(sum(weights_normalized[: index + 1]) * resolution),
80
+ rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
81
+ rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
69
82
  )
70
83
  for index, _ in enumerate(weights_normalized)
71
84
  ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.10
3
+ Version: 0.8.0
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
46
46
  Requires-Dist: platformdirs
47
47
  Requires-Dist: dvc-studio-client<1,>=0.21
48
48
  Requires-Dist: tabulate
49
+ Requires-Dist: websockets
49
50
  Provides-Extra: docs
50
51
  Requires-Dist: mkdocs>=1.5.2; extra == "docs"
51
52
  Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -91,14 +92,14 @@ Requires-Dist: types-requests; extra == "dev"
91
92
  Requires-Dist: types-tabulate; extra == "dev"
92
93
  Provides-Extra: examples
93
94
  Requires-Dist: datachain[tests]; extra == "examples"
94
- Requires-Dist: numpy<2,>=1; extra == "examples"
95
95
  Requires-Dist: defusedxml; extra == "examples"
96
96
  Requires-Dist: accelerate; extra == "examples"
97
- Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
97
+ Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
98
+ Requires-Dist: unstructured[pdf]; extra == "examples"
98
99
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
100
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
101
  Requires-Dist: onnx==1.16.1; extra == "examples"
101
- Requires-Dist: ultralytics==8.3.37; extra == "examples"
102
+ Requires-Dist: ultralytics==8.3.50; extra == "examples"
102
103
 
103
104
  ================
104
105
  |logo| DataChain
@@ -138,6 +139,11 @@ Use Cases
138
139
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
139
140
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
140
141
 
142
+ Getting Started
143
+ ===============
144
+
145
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
146
+ to get started with `DataChain` and learn more.
141
147
 
142
148
  Key Features
143
149
  ============
@@ -161,12 +167,6 @@ Key Features
161
167
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
162
168
 
163
169
 
164
- Getting Started
165
- ===============
166
-
167
- Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
168
-
169
-
170
170
  Contributing
171
171
  ============
172
172
 
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
4
4
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
5
- datachain/cli.py,sha256=wQiYQ_qSVCGvS06pkknT9_FIBdFRzBdeRusW9uXE3vQ,42505
5
+ datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
8
8
  datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
@@ -14,14 +14,14 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
14
14
  datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
15
15
  datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
16
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- datachain/studio.py,sha256=Hr0Ha0kou0so4i8i-gWiXC1AYlJ2arI1D55cc7mi3tg,7253
17
+ datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
18
18
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
19
  datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
20
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
21
- datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
21
+ datachain/catalog/catalog.py,sha256=nuWjSIs4MO1hJa8-LQGbiMXLWWznPB_VKSVpS7368t4,58415
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
24
- datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
24
+ datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
25
25
  datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
26
26
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
27
27
  datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
@@ -35,7 +35,7 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
35
35
  datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
36
36
  datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
37
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
38
- datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
38
+ datachain/data_storage/sqlite.py,sha256=iJv1QxwVifOowtYhIDqYVoea21dvkQIdxklGNIend3c,22961
39
39
  datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
40
40
  datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
41
41
  datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
@@ -53,17 +53,18 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
53
53
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
54
54
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
55
55
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
56
- datachain/lib/dc.py,sha256=xqLR4IH_mbuet0FsxBHDsRUg-zR6tO8UZdLQQTLG8EE,89533
57
- datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
56
+ datachain/lib/dc.py,sha256=7Wm6TEPVNCSh4bz0iA9JvEsYtYAZ9o97lK7TEJ8modE,92149
57
+ datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
58
+ datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
58
59
  datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
59
60
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
60
61
  datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
61
62
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
62
- datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
63
+ datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
63
64
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
64
- datachain/lib/pytorch.py,sha256=QMJO_OGEMvBi2x71vGcG25agLzNwyLmF4Qx5iILlwaM,6350
65
+ datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
65
66
  datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
66
- datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
67
+ datachain/lib/signal_schema.py,sha256=ziRTctom0-wAqURZfkfG6dc_3P2FcYxKjYsKC49NQ1Q,25415
67
68
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
68
69
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
69
70
  datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
@@ -88,7 +89,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
88
89
  datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
89
90
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
90
91
  datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
91
- datachain/query/dataset.py,sha256=eXr9fJz2grX2evmkmsiH0Xeqajd8gFnujmt_USMxy0c,54563
92
+ datachain/query/dataset.py,sha256=fECGctERQrfLIowN9Fo6dTSnmHEe9WbfcjHRtRObcio,54667
92
93
  datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
93
94
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
94
95
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,7 +97,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
96
97
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
97
98
  datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
98
99
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
- datachain/remote/studio.py,sha256=WiK6fpRAw0a6Dth4XXI0YInEHH4gDU7AUHHDNd3wJzg,11616
100
+ datachain/remote/studio.py,sha256=3DlgESETzxm3dgb6zzjjGxsddSkacT2dARnteLAfMxQ,13366
100
101
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
101
102
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
102
103
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -116,11 +117,11 @@ datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,
116
117
  datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
117
118
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
118
119
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
119
- datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
120
+ datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
120
121
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
121
- datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
122
- datachain-0.7.10.dist-info/METADATA,sha256=qtw_rToRdmR9-CO6MFCAGv6NWJJ87C95iQaDEnDE4H8,8371
123
- datachain-0.7.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
124
- datachain-0.7.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
125
- datachain-0.7.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
126
- datachain-0.7.10.dist-info/RECORD,,
122
+ datachain-0.8.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
123
+ datachain-0.8.0.dist-info/METADATA,sha256=PXb2pYY67bdfDjFXR7C9hwN6LaKSmseRZJNFakrWfyg,8437
124
+ datachain-0.8.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
125
+ datachain-0.8.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
126
+ datachain-0.8.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
127
+ datachain-0.8.0.dist-info/RECORD,,