datachain: 0.25.0 (py3-none-any.whl) → 0.25.1 (py3-none-any.whl)

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/arrow.py CHANGED
@@ -126,7 +126,16 @@ class ArrowGenerator(Generator):
126
126
  if isinstance(kwargs.get("format"), CsvFileFormat):
127
127
  kwargs["format"] = "csv"
128
128
  arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
129
+
130
+ if self.output_schema and hasattr(vals[0], "source"):
131
+ # if we are reading parquet file written by datachain it might have
132
+ # source inside of it already, so we should not duplicate it, instead
133
+ # we are re-creating it of the self.source flag
134
+ vals[0].source = arrow_file # type: ignore[attr-defined]
135
+
136
+ return vals
129
137
  return [arrow_file, *vals]
138
+
130
139
  return vals
131
140
 
132
141
  def _process_non_datachain_record(
datachain/lib/hf.py CHANGED
@@ -11,7 +11,7 @@ try:
11
11
  Image,
12
12
  IterableDataset,
13
13
  IterableDatasetDict,
14
- Sequence,
14
+ List,
15
15
  Value,
16
16
  load_dataset,
17
17
  )
@@ -59,7 +59,6 @@ class HFImage(DataModel):
59
59
 
60
60
 
61
61
  class HFAudio(DataModel):
62
- path: str
63
62
  array: list[float]
64
63
  sampling_rate: int
65
64
 
@@ -116,26 +115,24 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
116
115
  return {"": ds}
117
116
 
118
117
 
119
- def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
120
- if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
118
+ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
119
+ if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D, List)):
121
120
  return val
122
121
  if isinstance(feat, ClassLabel):
123
122
  return HFClassLabel(string=feat.names[val], integer=val)
124
- if isinstance(feat, Sequence):
125
- if isinstance(feat.feature, dict):
126
- sdict = {}
127
- for sname in val:
128
- sfeat = feat.feature[sname]
129
- sanno = anno.model_fields[sname].annotation
130
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
131
- return anno(**sdict)
132
- return val
123
+ if isinstance(feat, dict):
124
+ sdict = {}
125
+ for sname in val:
126
+ sfeat = feat[sname]
127
+ sanno = anno.model_fields[sname].annotation
128
+ sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
+ return anno(**sdict)
133
130
  if isinstance(feat, Image):
134
131
  if isinstance(val, dict):
135
132
  return HFImage(img=val["bytes"])
136
133
  return HFImage(img=image_to_bytes(val))
137
134
  if isinstance(feat, Audio):
138
- return HFAudio(**val)
135
+ return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
139
136
 
140
137
 
141
138
  def get_output_schema(features: Features) -> dict[str, DataType]:
@@ -151,13 +148,13 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
151
148
  return arrow_type_mapper(val.pa_type)
152
149
  if isinstance(val, ClassLabel):
153
150
  return HFClassLabel
154
- if isinstance(val, Sequence):
155
- if isinstance(val.feature, dict):
156
- sequence_dict = {}
157
- for sname, sval in val.feature.items():
158
- dtype = _feature_to_chain_type(sname, sval)
159
- sequence_dict[sname] = list[dtype] # type: ignore[valid-type]
160
- return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
151
+ if isinstance(val, dict):
152
+ sequence_dict = {}
153
+ for sname, sval in val.items():
154
+ dtype = _feature_to_chain_type(sname, sval)
155
+ sequence_dict[sname] = dtype # type: ignore[valid-type]
156
+ return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
157
+ if isinstance(val, List):
161
158
  return list[_feature_to_chain_type(name, val.feature)] # type: ignore[arg-type,misc,return-value]
162
159
  if isinstance(val, Array2D):
163
160
  dtype = arrow_type_mapper(string_to_arrow(val.dtype))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.25.0
3
+ Version: 0.25.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -70,7 +70,8 @@ Provides-Extra: vector
70
70
  Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
- Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
73
+ Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
74
+ Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
74
75
  Requires-Dist: fsspec>=2024.12.0; extra == "hf"
75
76
  Provides-Extra: video
76
77
  Requires-Dist: ffmpeg-python; extra == "video"
@@ -70,12 +70,12 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
70
70
  datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
71
71
  datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
72
72
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- datachain/lib/arrow.py,sha256=PyLXiscZ7sVEo65CAhYXmgHh1OLSH2lvbM5dAYhE8x4,10348
73
+ datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
74
74
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
75
75
  datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
76
76
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
77
77
  datachain/lib/file.py,sha256=gTzJXaGIyFOrw_B4yiOEs7U23n4oAQuWDI2v9KWwp2o,33889
78
- datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
78
+ datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
79
79
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
80
80
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
81
81
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
157
157
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
158
158
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
159
159
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
160
- datachain-0.25.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
- datachain-0.25.0.dist-info/METADATA,sha256=8CTAh5kMX-1sYBuh5CXD6u_gLS1yIWwWTwF1_umz7ek,13281
162
- datachain-0.25.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
- datachain-0.25.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
- datachain-0.25.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
- datachain-0.25.0.dist-info/RECORD,,
160
+ datachain-0.25.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
+ datachain-0.25.1.dist-info/METADATA,sha256=NaMV5K1wxCrOI7zW8agwmNfDMMkJJgaQ2fNX2PsuHnc,13385
162
+ datachain-0.25.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ datachain-0.25.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
+ datachain-0.25.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
+ datachain-0.25.1.dist-info/RECORD,,