datachain 0.25.0__py3-none-any.whl → 0.25.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/arrow.py CHANGED
@@ -126,7 +126,16 @@ class ArrowGenerator(Generator):
126
126
  if isinstance(kwargs.get("format"), CsvFileFormat):
127
127
  kwargs["format"] = "csv"
128
128
  arrow_file = ArrowRow(file=file, index=index, kwargs=kwargs)
129
+
130
+ if self.output_schema and hasattr(vals[0], "source"):
131
+ # if we are reading parquet file written by datachain it might have
132
+ # source inside of it already, so we should not duplicate it, instead
133
+ # we are re-creating it of the self.source flag
134
+ vals[0].source = arrow_file # type: ignore[attr-defined]
135
+
136
+ return vals
129
137
  return [arrow_file, *vals]
138
+
130
139
  return vals
131
140
 
132
141
  def _process_non_datachain_record(
datachain/lib/dc/datasets.py CHANGED
@@ -376,7 +376,7 @@ def move_dataset(
376
376
  the namespace and project, or a regular name. If a regular name is used,
377
377
  default values will be applied. The source dataset will no longer exist
378
378
  after the move.
379
- dst: The destination dataset name. This can also be a fully qualified
379
+ dest: The destination dataset name. This can also be a fully qualified
380
380
  name with a namespace and project, or just a regular name (default values
381
381
  will be used in that case). The original dataset will be moved here.
382
382
  session: An optional session instance. If not provided, the default session
datachain/lib/hf.py CHANGED
@@ -11,7 +11,7 @@ try:
11
11
  Image,
12
12
  IterableDataset,
13
13
  IterableDatasetDict,
14
- Sequence,
14
+ List,
15
15
  Value,
16
16
  load_dataset,
17
17
  )
@@ -59,7 +59,6 @@ class HFImage(DataModel):
59
59
 
60
60
 
61
61
  class HFAudio(DataModel):
62
- path: str
63
62
  array: list[float]
64
63
  sampling_rate: int
65
64
 
@@ -116,26 +115,24 @@ def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
116
115
  return {"": ds}
117
116
 
118
117
 
119
- def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
120
- if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
118
+ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
119
+ if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D, List)):
121
120
  return val
122
121
  if isinstance(feat, ClassLabel):
123
122
  return HFClassLabel(string=feat.names[val], integer=val)
124
- if isinstance(feat, Sequence):
125
- if isinstance(feat.feature, dict):
126
- sdict = {}
127
- for sname in val:
128
- sfeat = feat.feature[sname]
129
- sanno = anno.model_fields[sname].annotation
130
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
131
- return anno(**sdict)
132
- return val
123
+ if isinstance(feat, dict):
124
+ sdict = {}
125
+ for sname in val:
126
+ sfeat = feat[sname]
127
+ sanno = anno.model_fields[sname].annotation
128
+ sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
+ return anno(**sdict)
133
130
  if isinstance(feat, Image):
134
131
  if isinstance(val, dict):
135
132
  return HFImage(img=val["bytes"])
136
133
  return HFImage(img=image_to_bytes(val))
137
134
  if isinstance(feat, Audio):
138
- return HFAudio(**val)
135
+ return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
139
136
 
140
137
 
141
138
  def get_output_schema(features: Features) -> dict[str, DataType]:
@@ -151,13 +148,13 @@ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
151
148
  return arrow_type_mapper(val.pa_type)
152
149
  if isinstance(val, ClassLabel):
153
150
  return HFClassLabel
154
- if isinstance(val, Sequence):
155
- if isinstance(val.feature, dict):
156
- sequence_dict = {}
157
- for sname, sval in val.feature.items():
158
- dtype = _feature_to_chain_type(sname, sval)
159
- sequence_dict[sname] = list[dtype] # type: ignore[valid-type]
160
- return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
151
+ if isinstance(val, dict):
152
+ sequence_dict = {}
153
+ for sname, sval in val.items():
154
+ dtype = _feature_to_chain_type(sname, sval)
155
+ sequence_dict[sname] = dtype # type: ignore[valid-type]
156
+ return dict_to_data_model(name, sequence_dict) # type: ignore[arg-type]
157
+ if isinstance(val, List):
161
158
  return list[_feature_to_chain_type(name, val.feature)] # type: ignore[arg-type,misc,return-value]
162
159
  if isinstance(val, Array2D):
163
160
  dtype = arrow_type_mapper(string_to_arrow(val.dtype))
datachain-0.25.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.25.0
3
+ Version: 0.25.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -70,7 +70,8 @@ Provides-Extra: vector
70
70
  Requires-Dist: usearch; extra == "vector"
71
71
  Provides-Extra: hf
72
72
  Requires-Dist: numba>=0.60.0; extra == "hf"
73
- Requires-Dist: datasets[audio,vision]>=2.21.0; extra == "hf"
73
+ Requires-Dist: datasets[vision]>=4.0.0; extra == "hf"
74
+ Requires-Dist: datasets[audio]>=4.0.0; (sys_platform == "linux" or sys_platform == "darwin") and extra == "hf"
74
75
  Requires-Dist: fsspec>=2024.12.0; extra == "hf"
75
76
  Provides-Extra: video
76
77
  Requires-Dist: ffmpeg-python; extra == "video"
datachain-0.25.2.dist-info/RECORD CHANGED
@@ -70,12 +70,12 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
70
70
  datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
71
71
  datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
72
72
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- datachain/lib/arrow.py,sha256=PyLXiscZ7sVEo65CAhYXmgHh1OLSH2lvbM5dAYhE8x4,10348
73
+ datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
74
74
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
75
75
  datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
76
76
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
77
77
  datachain/lib/file.py,sha256=gTzJXaGIyFOrw_B4yiOEs7U23n4oAQuWDI2v9KWwp2o,33889
78
- datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
78
+ datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
79
79
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
80
80
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
81
81
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -104,7 +104,7 @@ datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,
104
104
  datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
105
105
  datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
106
106
  datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
107
- datachain/lib/dc/datasets.py,sha256=eBhcybEeXHcQ_7RweRCh5uJyF5Ym1EEDPmD0YWYDPHw,15097
107
+ datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
108
108
  datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
109
109
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
110
110
  datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
157
157
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
158
158
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
159
159
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
160
- datachain-0.25.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
- datachain-0.25.0.dist-info/METADATA,sha256=8CTAh5kMX-1sYBuh5CXD6u_gLS1yIWwWTwF1_umz7ek,13281
162
- datachain-0.25.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
- datachain-0.25.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
- datachain-0.25.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
- datachain-0.25.0.dist-info/RECORD,,
160
+ datachain-0.25.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
+ datachain-0.25.2.dist-info/METADATA,sha256=aA1Ee1umcPyEXMzrdlhNexDW1rq2zRo2IJHAKyOJwN4,13385
162
+ datachain-0.25.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ datachain-0.25.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
+ datachain-0.25.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
+ datachain-0.25.2.dist-info/RECORD,,