datachain 0.26.1__py3-none-any.whl → 0.26.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. See the package registry's advisory page for more details.

datachain/lib/arrow.py CHANGED
@@ -262,7 +262,7 @@ def _get_hf_schema(
262
262
  from datachain.lib.hf import get_output_schema, schema_from_arrow
263
263
 
264
264
  features = schema_from_arrow(schema)
265
- return features, get_output_schema(features)
265
+ return features, get_output_schema(features)[0]
266
266
  return None
267
267
 
268
268
 
@@ -3,6 +3,7 @@ from datetime import datetime
3
3
  from typing import ClassVar, Optional, Union, get_args, get_origin
4
4
 
5
5
  from pydantic import AliasChoices, BaseModel, Field, create_model
6
+ from pydantic.fields import FieldInfo
6
7
 
7
8
  from datachain.lib.model_store import ModelStore
8
9
  from datachain.lib.utils import normalize_col_names
@@ -89,7 +90,16 @@ def dict_to_data_model(
89
90
  }
90
91
 
91
92
  class _DataModelStrict(BaseModel, extra="forbid"):
92
- pass
93
+ @classmethod
94
+ def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
95
+ """Returns a map of aliases to original field names and info."""
96
+ field_info = {}
97
+ for _name, field in cls.model_fields.items():
98
+ assert isinstance(field.validation_alias, AliasChoices)
99
+ # Add mapping for all aliases (both normalized and original names)
100
+ for alias in field.validation_alias.choices:
101
+ field_info[str(alias)] = (_name, field)
102
+ return field_info
93
103
 
94
104
  return create_model(
95
105
  name,
datachain/lib/dc/hf.py CHANGED
@@ -32,6 +32,7 @@ def read_hf(
32
32
  Parameters:
33
33
  dataset : Path or name of the dataset to read from Hugging Face Hub,
34
34
  or an instance of `datasets.Dataset`-like object.
35
+ args : Additional positional arguments to pass to datasets.load_dataset.
35
36
  session : Session to use for the chain.
36
37
  settings : Settings to use for the chain.
37
38
  column : Generated object column name.
@@ -64,8 +65,9 @@ def read_hf(
64
65
 
65
66
  model_name = model_name or column or ""
66
67
  hf_features = next(iter(ds_dict.values())).features
67
- output = output | get_output_schema(hf_features)
68
- model = dict_to_data_model(model_name, output)
68
+ hf_output, normalized_names = get_output_schema(hf_features, list(output.keys()))
69
+ output = output | hf_output
70
+ model = dict_to_data_model(model_name, output, list(normalized_names.values()))
69
71
  if column:
70
72
  output = {column: model}
71
73
 
datachain/lib/hf.py CHANGED
@@ -26,7 +26,7 @@ except ImportError as exc:
26
26
  ) from exc
27
27
 
28
28
  from io import BytesIO
29
- from typing import TYPE_CHECKING, Any, Union
29
+ from typing import TYPE_CHECKING, Any, Optional, Union
30
30
 
31
31
  import PIL
32
32
  from tqdm.auto import tqdm
@@ -34,6 +34,7 @@ from tqdm.auto import tqdm
34
34
  from datachain.lib.arrow import arrow_type_mapper
35
35
  from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
36
36
  from datachain.lib.udf import Generator
37
+ from datachain.lib.utils import normalize_col_names
37
38
 
38
39
  if TYPE_CHECKING:
39
40
  import pyarrow as pa
@@ -94,14 +95,18 @@ class HFGenerator(Generator):
94
95
  ds = self.ds_dict[split]
95
96
  if split:
96
97
  desc += f" split '{split}'"
98
+ model_fields = self.output_schema._model_fields_by_aliases() # type: ignore[attr-defined]
97
99
  with tqdm(desc=desc, unit=" rows", leave=False) as pbar:
98
100
  for row in ds:
99
101
  output_dict = {}
100
102
  if split and "split" in self.output_schema.model_fields:
101
103
  output_dict["split"] = split
102
104
  for name, feat in ds.features.items():
103
- anno = self.output_schema.model_fields[name].annotation
104
- output_dict[name] = convert_feature(row[name], feat, anno)
105
+ normalized_name, info = model_fields[name]
106
+ anno = info.annotation
107
+ output_dict[normalized_name] = convert_feature(
108
+ row[name], feat, anno
109
+ )
105
110
  yield self.output_schema(**output_dict)
106
111
  pbar.update(1)
107
112
 
@@ -122,10 +127,12 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
122
127
  return HFClassLabel(string=feat.names[val], integer=val)
123
128
  if isinstance(feat, dict):
124
129
  sdict = {}
130
+ model_fields = anno._model_fields_by_aliases() # type: ignore[attr-defined]
125
131
  for sname in val:
126
132
  sfeat = feat[sname]
127
- sanno = anno.model_fields[sname].annotation
128
- sdict[sname] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
133
+ norm_name, info = model_fields[sname]
134
+ sanno = info.annotation
135
+ sdict[norm_name] = [convert_feature(v, sfeat, sanno) for v in val[sname]]
129
136
  return anno(**sdict)
130
137
  if isinstance(feat, Image):
131
138
  if isinstance(val, dict):
@@ -135,12 +142,26 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any:
135
142
  return HFAudio(array=val["array"], sampling_rate=val["sampling_rate"])
136
143
 
137
144
 
138
- def get_output_schema(features: Features) -> dict[str, DataType]:
139
- """Generate UDF output schema from huggingface datasets features."""
145
+ def get_output_schema(
146
+ features: Features, existing_column_names: Optional[list[str]] = None
147
+ ) -> tuple[dict[str, DataType], dict[str, str]]:
148
+ """
149
+ Generate UDF output schema from Hugging Face datasets features. It normalizes the
150
+ column names and returns a mapping of normalized names to original names along with
151
+ the data types. `existing_column_names` is the list of column names that already
152
+ exist in the dataset (to avoid name collisions due to normalization).
153
+ """
154
+ existing_column_names = existing_column_names or []
140
155
  fields_dict = {}
141
- for name, val in features.items():
142
- fields_dict[name] = _feature_to_chain_type(name, val)
143
- return fields_dict
156
+ normalized_names = normalize_col_names(
157
+ existing_column_names + list(features.keys())
158
+ )
159
+ # List of tuple(str, str) for HF dataset feature names, (normalized, original)
160
+ new_feature_names = list(normalized_names.items())[len(existing_column_names) :]
161
+ for idx, feat in enumerate(features.items()):
162
+ name, val = feat
163
+ fields_dict[new_feature_names[idx][0]] = _feature_to_chain_type(name, val)
164
+ return fields_dict, normalized_names
144
165
 
145
166
 
146
167
  def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.26.1
3
+ Version: 0.26.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -98,7 +98,7 @@ Requires-Dist: scipy; extra == "tests"
98
98
  Requires-Dist: ultralytics; extra == "tests"
99
99
  Provides-Extra: dev
100
100
  Requires-Dist: datachain[docs,tests]; extra == "dev"
101
- Requires-Dist: mypy==1.16.1; extra == "dev"
101
+ Requires-Dist: mypy==1.17.0; extra == "dev"
102
102
  Requires-Dist: types-python-dateutil; extra == "dev"
103
103
  Requires-Dist: types-pytz; extra == "dev"
104
104
  Requires-Dist: types-PyYAML; extra == "dev"
@@ -70,13 +70,13 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
70
70
  datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
71
71
  datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
72
72
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
- datachain/lib/arrow.py,sha256=hdEQ8I1JgNmEAaXTaqaU1qvZDi5dgtes1IC69ycthz8,10753
73
+ datachain/lib/arrow.py,sha256=gMgmiMOhTGFMSyWBbjyzF2RsSXjx0XmUGPoSBxcWwe0,10756
74
74
  datachain/lib/audio.py,sha256=J7XJ14ItPF9y6pN-tmMV9In9X9rgwlBwzyzdGOUkPGk,4376
75
75
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
76
- datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
76
+ datachain/lib/data_model.py,sha256=JPHPO6z-pehyiY-qNBAnp8u015xUHrijPKbGkMHS6lo,3493
77
77
  datachain/lib/dataset_info.py,sha256=7w-DoKOyIVoOtWGCgciMLcP5CiAWJB3rVI-vUDF80k0,3311
78
78
  datachain/lib/file.py,sha256=tHBBacsh1580UPFC6fAINBNwNiyymNgzj89rpsz1LKc,40817
79
- datachain/lib/hf.py,sha256=_dCoGTv7n5cBgxhCDfZI-t3hnMCXGHd6sEsxRThcizE,5754
79
+ datachain/lib/hf.py,sha256=dadHs2dsi4ALwXz92Y3T7AUgq3wQF4mBydWqHCMjvks,6880
80
80
  datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
81
81
  datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
82
82
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -106,7 +106,7 @@ datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
106
106
  datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
107
107
  datachain/lib/dc/datachain.py,sha256=ap54lcuj71tvp0zX1jiFFiEWvA5UPeyYJRJkd2APmlI,92897
108
108
  datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
109
- datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
109
+ datachain/lib/dc/hf.py,sha256=MJWO-NL4jAD6CEAmXsyeqXEyvefRLMhyxhT9jKT5vMU,2324
110
110
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
111
111
  datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
112
112
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
@@ -158,9 +158,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
158
158
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
159
159
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
160
160
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
161
- datachain-0.26.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
162
- datachain-0.26.1.dist-info/METADATA,sha256=C0Pb9d9IcJ6oOPXihcyEhTc_Rf7Fe4pP_anKhC3JfeU,13543
163
- datachain-0.26.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- datachain-0.26.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
165
- datachain-0.26.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
166
- datachain-0.26.1.dist-info/RECORD,,
161
+ datachain-0.26.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
162
+ datachain-0.26.2.dist-info/METADATA,sha256=2wCIuTSRK8oTU4mCd3uMRr9PQ9WxeEW2XCncqJep7Hc,13543
163
+ datachain-0.26.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
+ datachain-0.26.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
165
+ datachain-0.26.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
166
+ datachain-0.26.2.dist-info/RECORD,,