datachain 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/arrow.py +9 -0
- datachain/lib/dc/datasets.py +1 -1
- datachain/lib/hf.py +18 -21
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/METADATA +3 -2
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/RECORD +9 -9
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/WHEEL +0 -0
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.25.0.dist-info → datachain-0.25.2.dist-info}/top_level.txt +0 -0
datachain/lib/arrow.py
CHANGED
|
@@ -126,7 +126,16 @@ class ArrowGenerator(Generator):
|
|
|
126
126
|
if isinstance(kwargs.get("format"), CsvFileFormat):
|
|
127
127
|
kwargs["format"] = "csv"
|
|
128
128
|
arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
|
|
129
|
+
|
|
130
|
+
if self.output_schema and hasattr(vals[0], "source"):
|
|
131
|
+
# if we are reading parquet file written by datachain it might have
|
|
132
|
+
# source inside of it already, so we should not duplicate it, instead
|
|
133
|
+
# we are re-creating it of the self.source flag
|
|
134
|
+
vals[0].source = arrow_file # type: ignore[attr-defined]
|
|
135
|
+
|
|
136
|
+
return vals
|
|
129
137
|
return [arrow_file, *vals]
|
|
138
|
+
|
|
130
139
|
return vals
|
|
131
140
|
|
|
132
141
|
def _process_non_datachain_record(
|
datachain/lib/dc/datasets.py
CHANGED
|
@@ -376,7 +376,7 @@ def move_dataset(
|
|
|
376
376
|
the namespace and project, or a regular name. If a regular name is used,
|
|
377
377
|
default values will be applied. The source dataset will no longer exist
|
|
378
378
|
after the move.
|
|
379
|
-
|
|
379
|
+
dest: The destination dataset name. This can also be a fully qualified
|
|
380
380
|
name with a namespace and project, or just a regular name (default values
|
|
381
381
|
will be used in that case). The original dataset will be moved here.
|
|
382
382
|
session: An optional session instance. If not provided, the default session
|
datachain/lib/hf.py
CHANGED
|
@@ -11,7 +11,7 @@ try:
|
|
|
11
11
|
Image,
|
|
12
12
|
IterableDataset,
|
|
13
13
|
IterableDatasetDict,
|
|
14
|
-
|
|
14
|
+
List,
|
|
15
15
|
Value,
|
|
16
16
|
load_dataset,
|
|
17
17
|
)
|
|
@@ -59,7 +59,6 @@ class HFImage(DataModel):
|
|
|
59
59
|
|
|
60
60
|
|
|
61
61
|
class HFAudio(DataModel):
|
|
62
|
-
path: str
|
|
63
62
|
array: list[float]
|
|
64
63
|
sampling_rate: int
|
|
65
64
|
|
|
@@ -116,26 +115,24 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
|
|
|
116
115
|
return {"": ds}
|
|
117
116
|
|
|
118
117
|
|
|
119
|
-
def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
120
|
-
if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
|
|
118
|
+
def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
|
|
119
|
+
if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D, List)):
|
|
121
120
|
return val
|
|
122
121
|
if isinstance(feat, ClassLabel):
|
|
123
122
|
return HFClassLabel(string=feat.names[val], integer=val)
|
|
124
|
-
if isinstance(feat,
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
return anno(**sdict)
|
|
132
|
-
return val
|
|
123
|
+
if isinstance(feat, dict):
|
|
124
|
+
sdict = {}
|
|
125
|
+
for sname in val:
|
|
126
|
+
sfeat = feat[sname]
|
|
127
|
+
sanno = anno.model_fields[sname].annotation
|
|
128
|
+
sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
|
|
129
|
+
return anno(**sdict)
|
|
133
130
|
if isinstance(feat, Image):
|
|
134
131
|
if isinstance(val, dict):
|
|
135
132
|
return HFImage(img=val["bytes"])
|
|
136
133
|
return HFImage(img=image_to_bytes(val))
|
|
137
134
|
if isinstance(feat, Audio):
|
|
138
|
-
return HFAudio(
|
|
135
|
+
return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
|
|
139
136
|
|
|
140
137
|
|
|
141
138
|
def get_output_schema(features: Features) -> dict[str, DataType]:
|
|
@@ -151,13 +148,13 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
|
|
|
151
148
|
return arrow_type_mapper(val.pa_type)
|
|
152
149
|
if isinstance(val, ClassLabel):
|
|
153
150
|
return HFClassLabel
|
|
154
|
-
if isinstance(val,
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
151
|
+
if isinstance(val, dict):
|
|
152
|
+
sequence_dict = {}
|
|
153
|
+
for sname, sval in val.items():
|
|
154
|
+
dtype = _feature_to_chain_type(sname, sval)
|
|
155
|
+
sequence_dict[sname] = dtype # type: ignore[valid-type]
|
|
156
|
+
return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
|
|
157
|
+
if isinstance(val, List):
|
|
161
158
|
return list[_feature_to_chain_type(name, val.feature)] # type: ignore[arg-type,misc,return-value]
|
|
162
159
|
if isinstance(val, Array2D):
|
|
163
160
|
dtype = arrow_type_mapper(string_to_arrow(val.dtype))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.25.0
|
|
3
|
+
Version: 0.25.2
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -70,7 +70,8 @@ Provides-Extra: vector
|
|
|
70
70
|
Requires-Dist: usearch; extra == "vector"
|
|
71
71
|
Provides-Extra: hf
|
|
72
72
|
Requires-Dist: numba>=0.60.0; extra == "hf"
|
|
73
|
-
Requires-Dist: datasets[
|
|
73
|
+
Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
|
|
74
|
+
Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
|
|
74
75
|
Requires-Dist: fsspec>=2024.12.0; extra == "hf"
|
|
75
76
|
Provides-Extra: video
|
|
76
77
|
Requires-Dist: ffmpeg-python; extra == "video"
|
|
@@ -70,12 +70,12 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
|
|
|
70
70
|
datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
|
|
71
71
|
datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
|
|
72
72
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
73
|
-
datachain/lib/arrow.py,sha256=
|
|
73
|
+
datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
|
|
74
74
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
75
75
|
datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
|
|
76
76
|
datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
|
|
77
77
|
datachain/lib/file.py,sha256=gTzJXaGIyFOrw_B4yiOEs7U23n4oAQuWDI2v9KWwp2o,33889
|
|
78
|
-
datachain/lib/hf.py,sha256=
|
|
78
|
+
datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
|
|
79
79
|
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
80
80
|
datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
|
|
81
81
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
@@ -104,7 +104,7 @@ datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,
|
|
|
104
104
|
datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
|
|
105
105
|
datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
|
|
106
106
|
datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
|
|
107
|
-
datachain/lib/dc/datasets.py,sha256=
|
|
107
|
+
datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
|
|
108
108
|
datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
|
|
109
109
|
datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
|
|
110
110
|
datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
|
|
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
157
157
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
158
158
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
159
159
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
160
|
-
datachain-0.25.
|
|
161
|
-
datachain-0.25.
|
|
162
|
-
datachain-0.25.
|
|
163
|
-
datachain-0.25.
|
|
164
|
-
datachain-0.25.
|
|
165
|
-
datachain-0.25.
|
|
160
|
+
datachain-0.25.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
161
|
+
datachain-0.25.2.dist-info/METADATA,sha256=aA1Ee1umcPyEXMzrdlhNexDW1rq2zRo2IJHAKyOJwN4,13385
|
|
162
|
+
datachain-0.25.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
163
|
+
datachain-0.25.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
164
|
+
datachain-0.25.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
165
|
+
datachain-0.25.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|