datachain 0.37.8__py3-none-any.whl → 0.37.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -18,6 +18,7 @@ from datachain.data_storage.schema import convert_rows_custom_column_types
18
18
  from datachain.data_storage.serializer import Serializable
19
19
  from datachain.dataset import DatasetRecord, StorageURI
20
20
  from datachain.lib.file import File
21
+ from datachain.lib.model_store import ModelStore
21
22
  from datachain.lib.signal_schema import SignalSchema
22
23
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
23
24
  from datachain.query.batch import RowsOutput
@@ -76,6 +77,29 @@ class AbstractWarehouse(ABC, Serializable):
76
77
  def cleanup_for_tests(self):
77
78
  """Cleanup for tests."""
78
79
 
80
def _to_jsonable(self, obj: Any) -> Any:
    """Recursively convert Python/Pydantic structures into JSON-serializable
    objects.

    Conversion rules:

    - Pydantic models (as reported by ``ModelStore.is_pydantic``) are dumped
      with ``model_dump()`` and the dump is converted again, so containers
      nested inside a model are normalized like top-level ones.
    - Dicts keep ``str`` keys unchanged; any other key is converted and then
      JSON-encoded into a string. Values are converted recursively.
    - Lists, tuples and sets become lists of converted items.
    - Any other value is returned unchanged.
    """
    if ModelStore.is_pydantic(type(obj)):
        # In its default mode model_dump() returns plain Python data, which
        # may still contain sets or dicts with non-string keys; pass the
        # dump through the same conversion instead of returning it raw.
        return self._to_jsonable(obj.model_dump())

    if isinstance(obj, dict):
        out: dict[str, Any] = {}
        for k, v in obj.items():
            if isinstance(k, str):
                key_str = k
            else:
                # JSON object keys must be strings: encode the converted key.
                key_str = json.dumps(self._to_jsonable(k), ensure_ascii=False)
            out[key_str] = self._to_jsonable(v)
        return out

    if isinstance(obj, (list, tuple, set)):
        return [self._to_jsonable(i) for i in obj]

    return obj
102
+
79
103
  def convert_type( # noqa: PLR0911
80
104
  self,
81
105
  val: Any,
@@ -122,11 +146,13 @@ class AbstractWarehouse(ABC, Serializable):
122
146
  if col_python_type is dict or col_type_name == "JSON":
123
147
  if value_type is str:
124
148
  return val
125
- if value_type in (dict, list):
126
- return json.dumps(val, ensure_ascii=False)
127
- raise ValueError(
128
- f"Cannot convert value {val!r} with type {value_type} to JSON"
129
- )
149
+ try:
150
+ json_ready = self._to_jsonable(val)
151
+ return json.dumps(json_ready, ensure_ascii=False)
152
+ except Exception as e:
153
+ raise ValueError(
154
+ f"Cannot convert value {val!r} with type {value_type} to JSON"
155
+ ) from e
130
156
 
131
157
  if isinstance(val, col_python_type):
132
158
  return val
@@ -13,41 +13,153 @@ class ValuesToTupleError(DataChainParamsError):
13
13
  super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")
14
14
 
15
15
 
16
- def values_to_tuples( # noqa: C901, PLR0912
17
- ds_name: str = "",
18
- output: DataType | Sequence[str] | dict[str, DataType] | None = None,
19
- **fr_map: Sequence[DataValue],
20
- ) -> tuple[Any, Any, Any]:
21
- if output:
22
- if not isinstance(output, (Sequence, str, dict)):
23
- if len(fr_map) != 1:
24
- raise ValuesToTupleError(
25
- ds_name,
26
- f"only one output type was specified, {len(fr_map)} expected",
27
- )
28
- if not isinstance(output, type):
29
- raise ValuesToTupleError(
30
- ds_name,
31
- f"output must specify a type while '{output}' was given",
32
- )
16
+ def _find_first_non_none(sequence: Sequence[Any]) -> Any | None:
17
+ """Find the first non-None element in a sequence."""
18
+ try:
19
+ return next(itertools.dropwhile(lambda i: i is None, sequence))
20
+ except StopIteration:
21
+ return None
22
+
23
+
24
+ def _infer_list_item_type(lst: list) -> type:
25
+ """Infer the item type of a list, handling None values and nested lists."""
26
+ if len(lst) == 0:
27
+ # Default to str when list is empty to avoid generic list
28
+ return str
29
+
30
+ first_item = _find_first_non_none(lst)
31
+ if first_item is None:
32
+ # Default to str when all items are None
33
+ return str
34
+
35
+ item_type = type(first_item)
36
+
37
+ # Handle nested lists one level deep
38
+ if isinstance(first_item, list) and len(first_item) > 0:
39
+ nested_item = _find_first_non_none(first_item)
40
+ if nested_item is not None:
41
+ return list[type(nested_item)] # type: ignore[misc, return-value]
42
+ # Default to str for nested lists with all None
43
+ return list[str] # type: ignore[return-value]
44
+
45
+ return item_type
46
+
47
+
48
+ def _infer_dict_value_type(dct: dict) -> type:
49
+ """Infer the value type of a dict, handling None values and list values."""
50
+ if len(dct) == 0:
51
+ # Default to str when dict is empty to avoid generic dict values
52
+ return str
53
+
54
+ # Find first non-None value
55
+ first_value = None
56
+ for val in dct.values():
57
+ if val is not None:
58
+ first_value = val
59
+ break
60
+
61
+ if first_value is None:
62
+ # Default to str when all values are None
63
+ return str
64
+
65
+ # Handle list values
66
+ if isinstance(first_value, list) and len(first_value) > 0:
67
+ list_item = _find_first_non_none(first_value)
68
+ if list_item is not None:
69
+ return list[type(list_item)] # type: ignore[misc, return-value]
70
+ # Default to str for lists with all None
71
+ return list[str] # type: ignore[return-value]
72
+
73
+ return type(first_value)
74
+
75
+
76
def _infer_type_from_sequence(
    sequence: Sequence[DataValue], signal_name: str, ds_name: str
) -> type:
    """
    Infer a signal's type from the first non-None value of ``sequence``.

    Returns ``str`` when there is no non-None value. A list value yields
    ``list[<item type>]``; a dict value yields ``dict[<key type>, <value type>]``
    (``dict[str, str]`` for an empty dict). Any other value yields its own type.

    Raises ``ValuesToTupleError`` when the value's type is rejected by
    ``is_chain_type``; ``signal_name`` and ``ds_name`` are only used in
    that error.
    """
    sample = _find_first_non_none(sequence)
    if sample is None:
        # Nothing to inspect: the column is empty or entirely None.
        return str

    typ = type(sample)
    if not is_chain_type(typ):
        raise ValuesToTupleError(
            ds_name,
            f"signal '{signal_name}' has unsupported type '{typ.__name__}'."
            f" Please use DataModel types: {DataTypeNames}",
        )

    if isinstance(sample, dict):
        if not sample:
            # No key or value to inspect, so use str for both.
            return dict[str, str]  # type: ignore[return-value]
        key_type = type(next(iter(sample)))
        return dict[key_type, _infer_dict_value_type(sample)]  # type: ignore[misc, return-value]

    if isinstance(sample, list):
        return list[_infer_list_item_type(sample)]  # type: ignore[misc, return-value]

    return typ
33
113
 
34
- key: str = next(iter(fr_map.keys()))
35
- output = {key: output} # type: ignore[dict-item]
36
114
 
37
- if not isinstance(output, dict):
115
def _validate_and_normalize_output(
    output: DataType | Sequence[str] | dict[str, DataType] | None,
    fr_map: dict[str, Sequence[DataValue]],
    ds_name: str,
) -> dict[str, DataType] | None:
    """Check ``output`` against ``fr_map`` and return it as a dict.

    Returns None when ``output`` is falsy. A dict is returned as is once its
    length matches the number of signals in ``fr_map``. A single type is
    accepted only when ``fr_map`` holds exactly one signal and is returned as
    ``{signal_name: output}``. Every other case raises ``ValuesToTupleError``.
    """
    if not output:
        return None

    if isinstance(output, dict):
        if len(output) != len(fr_map):
            raise ValuesToTupleError(
                ds_name,
                f"number of outputs '{len(output)}' should match"
                f" number of signals '{len(fr_map)}'",
            )
        return output  # type: ignore[return-value]

    if isinstance(output, (Sequence, str)):
        # Sequences and strings are not accepted as an output spec.
        raise ValuesToTupleError(
            ds_name,
            "output type must be dict[str, DataType] while "
            f"'{type(output).__name__}' is given",
        )

    # Remaining case: a single output value, valid only for a single signal.
    if len(fr_map) != 1:
        raise ValuesToTupleError(
            ds_name,
            f"only one output type was specified, {len(fr_map)} expected",
        )
    if not isinstance(output, type):
        raise ValuesToTupleError(
            ds_name,
            f"output must specify a type while '{output}' was given",
        )

    signal_name: str = next(iter(fr_map))
    return {signal_name: output}  # type: ignore[dict-item]
154
+
155
+
156
+ def values_to_tuples(
157
+ ds_name: str = "",
158
+ output: DataType | Sequence[str] | dict[str, DataType] | None = None,
159
+ **fr_map: Sequence[DataValue],
160
+ ) -> tuple[Any, Any, Any]:
161
+ output = _validate_and_normalize_output(output, fr_map, ds_name)
162
+
51
163
  types_map: dict[str, type] = {}
52
164
  length = -1
53
165
  for k, v in fr_map.items():
@@ -65,23 +177,7 @@ def values_to_tuples( # noqa: C901, PLR0912
65
177
  # FIXME: Stops as soon as it finds the first non-None value.
66
178
  # If a non-None value appears early, it won't check the remaining items for
67
179
  # `None` values.
68
- try:
69
- first_not_none_element = next(
70
- itertools.dropwhile(lambda i: i is None, v)
71
- )
72
- except StopIteration:
73
- # set default type to `str` if column is empty or all values are `None`
74
- typ = str
75
- else:
76
- typ = type(first_not_none_element) # type: ignore[assignment]
77
- if not is_chain_type(typ):
78
- raise ValuesToTupleError(
79
- ds_name,
80
- f"signal '{k}' has unsupported type '{typ.__name__}'."
81
- f" Please use DataModel types: {DataTypeNames}",
82
- )
83
- if isinstance(first_not_none_element, list):
84
- typ = list[type(first_not_none_element[0])] # type: ignore[assignment, misc]
180
+ typ = _infer_type_from_sequence(v, k, ds_name)
85
181
  types_map[k] = typ
86
182
 
87
183
  if length < 0:
@@ -64,6 +64,9 @@ def is_chain_type(t: type) -> bool:
64
64
  if orig is list and len(args) == 1:
65
65
  return is_chain_type(get_args(t)[0])
66
66
 
67
+ if orig is dict and len(args) == 2:
68
+ return is_chain_type(args[0]) and is_chain_type(args[1])
69
+
67
70
  if orig in (Union, types.UnionType) and len(args) == 2 and (type(None) in args):
68
71
  return is_chain_type(args[0] if args[1] is type(None) else args[1])
69
72
 
@@ -1,6 +1,5 @@
1
1
  import copy
2
2
  import hashlib
3
- import json
4
3
  import logging
5
4
  import math
6
5
  import types
@@ -14,9 +13,7 @@ from typing import (
14
13
  TYPE_CHECKING,
15
14
  Annotated,
16
15
  Any,
17
- Dict, # type: ignore[UP035]
18
16
  Final,
19
- List, # type: ignore[UP035]
20
17
  Literal,
21
18
  Optional,
22
19
  Union,
@@ -24,6 +21,7 @@ from typing import (
24
21
  get_origin,
25
22
  )
26
23
 
24
+ import ujson as json
27
25
  from pydantic import BaseModel, Field, ValidationError, create_model
28
26
  from sqlalchemy import ColumnElement
29
27
  from typing_extensions import Literal as LiteralEx
@@ -569,8 +567,10 @@ class SignalSchema:
569
567
  pos = 0
570
568
  for fr_cls in self.values.values():
571
569
  if (fr := ModelStore.to_pydantic(fr_cls)) is None:
572
- res.append(row[pos])
570
+ value = row[pos]
573
571
  pos += 1
572
+ converted = self._convert_feature_value(fr_cls, value, catalog, cache)
573
+ res.append(converted)
574
574
  else:
575
575
  json, pos = unflatten_to_json_pos(fr, row, pos) # type: ignore[union-attr]
576
576
  try:
@@ -585,6 +585,72 @@ class SignalSchema:
585
585
  res.append(obj)
586
586
  return res
587
587
 
588
def _convert_feature_value(
    self,
    annotation: DataType,
    value: Any,
    catalog: "Catalog",
    cache: bool,
) -> Any:
    """Convert a raw value into the shape declared by ``annotation``.

    - None is returned as None.
    - A union with exactly one non-None member is treated as that member;
      any other union leaves the value unchanged.
    - For a Pydantic annotation, an existing instance is kept and a Mapping
      is passed as keyword arguments to the model; ``_set_file_stream`` is
      then called on the model. Other values are returned unchanged.
    - For ``list[T]``, each non-None item of a list or tuple is converted
      to ``T``.
    - For ``dict[K, V]``, keys are JSON-decoded and converted unless ``K``
      is ``str``, and values are converted unless ``V`` is ``Any``.
    - Anything else is returned unchanged.
    """
    if value is None:
        return None

    origin = get_origin(annotation)

    if origin in (Union, types.UnionType):
        members = [arg for arg in get_args(annotation) if arg is not type(None)]
        if len(members) != 1:
            # Not a plain Optional[...]: no single target type to convert to.
            return value
        annotation = members[0]
        origin = get_origin(annotation)

    if ModelStore.is_pydantic(annotation):
        if isinstance(value, annotation):
            model = value
        elif isinstance(value, Mapping):
            model = annotation(**value)
        else:
            return value
        assert isinstance(model, BaseModel)
        SignalSchema._set_file_stream(model, catalog, cache)
        return model

    if origin is list:
        type_args = get_args(annotation)
        if not (type_args and isinstance(value, (list, tuple))):
            return value
        item_type = type_args[0]
        return [
            None
            if item is None
            else self._convert_feature_value(item_type, item, catalog, cache)
            for item in value
        ]

    if origin is dict:
        type_args = get_args(annotation)
        if len(type_args) != 2 or not isinstance(value, dict):
            return value
        key_type, val_type = type_args
        converted: dict = {}
        for raw_key, raw_val in value.items():
            if key_type is str:
                new_key = raw_key
            else:
                # Non-string keys are JSON-decoded before conversion.
                new_key = self._convert_feature_value(
                    key_type, json.loads(raw_key), catalog, cache
                )
            if val_type is Any:
                new_val = raw_val
            else:
                new_val = self._convert_feature_value(
                    val_type, raw_val, catalog, cache
                )
            converted[new_key] = new_val
        return converted

    return value
653
+
588
654
  @staticmethod
589
655
  def _set_file_stream(
590
656
  obj: BaseModel, catalog: "Catalog", cache: bool = False
@@ -898,13 +964,13 @@ class SignalSchema:
898
964
  args = get_args(type_)
899
965
  type_str = SignalSchema._type_to_str(args[0], subtypes)
900
966
  return f"Optional[{type_str}]"
901
- if origin in (list, List): # noqa: UP006
967
+ if origin is list:
902
968
  args = get_args(type_)
903
969
  if len(args) == 0:
904
970
  return "list"
905
971
  type_str = SignalSchema._type_to_str(args[0], subtypes)
906
972
  return f"list[{type_str}]"
907
- if origin in (dict, Dict): # noqa: UP006
973
+ if origin is dict:
908
974
  args = get_args(type_)
909
975
  if len(args) == 0:
910
976
  return "dict"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.37.8
3
+ Version: 0.37.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -58,7 +58,7 @@ datachain/data_storage/metastore.py,sha256=DFyTkKLJN5-nFXXc7ln_rGj-FLctj0nrhXJxu
58
58
  datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
59
59
  datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
60
60
  datachain/data_storage/sqlite.py,sha256=o9TR6N27JB52M9rRXdM9uwdBektGucWtJi9UnmLGh0A,29669
61
- datachain/data_storage/warehouse.py,sha256=Zhf_HzhiEpsI0IuinAK-sF4ZMH66rV_ZDSOx-UFHv5o,34771
61
+ datachain/data_storage/warehouse.py,sha256=_TGfMOtpltHA-G1KgoeIc_FFUomSmpAr94p-9AWNYIE,35642
62
62
  datachain/diff/__init__.py,sha256=lGrygGzdWSSYJ1DgX4h2q_ko5QINEW8PKfxOwE9ZFnI,9394
63
63
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -78,7 +78,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
78
  datachain/lib/arrow.py,sha256=eCZtqbjAzkL4aemY74f_XkIJ_FWwXugJNjIFOwDa9w0,10815
79
79
  datachain/lib/audio.py,sha256=hHG29vqrV389im152wCjh80d0xqXGGvFnUpUwkzZejQ,7385
80
80
  datachain/lib/clip.py,sha256=nF8-N6Uz0MbAsPJBY2iXEYa3DPLo80OOer5SRNAtcGM,6149
81
- datachain/lib/data_model.py,sha256=H-bagx24-cLlC7ngSP6Dby4mB6kSxxV7KDiHxQjzwlg,3798
81
+ datachain/lib/data_model.py,sha256=srz0pfFohSXwFnt5OMi1fNjSbKkFq8vzkcO0n4PHxlQ,3904
82
82
  datachain/lib/dataset_info.py,sha256=Ym7yYcGpfUmPLrfdxueijCVRP2Go6KbyuLk_fmzYgDU,3273
83
83
  datachain/lib/file.py,sha256=YO4QUaZVZ0TVW9fahERZ3HJXPNXjB4oYzvLQntQYT9s,47501
84
84
  datachain/lib/hf.py,sha256=jmyqRDXdksojUJCiU_2XFSIoMzzDJAZQs9xr-sEwEJc,7281
@@ -91,7 +91,7 @@ datachain/lib/namespaces.py,sha256=d4Zt2mYdGFctkA20SkB1woUxrNI4JwSxruxUGKwfauc,3
91
91
  datachain/lib/projects.py,sha256=FfBfGoWvy1SccCQW2ITKdDA6V03FbnRCusOeHdPHr6Y,4059
92
92
  datachain/lib/pytorch.py,sha256=gDJiUGoSaraW3JDPr5JW2a3SqT7KwgIMMpDTAC0L1_Y,7792
93
93
  datachain/lib/settings.py,sha256=maMtywOUetJvEApDiMVfTTq-oaRNvUIfDCrqZwFL2GE,7559
94
- datachain/lib/signal_schema.py,sha256=HeACY2i1bp9HOuaNm4_DvJh54Xnay3-qjcdERUBwFTU,41180
94
+ datachain/lib/signal_schema.py,sha256=k43MncD1eew3zS6h_OYujg3jbvR6WH4Sj2mbrGvvvhc,43554
95
95
  datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
96
96
  datachain/lib/text.py,sha256=uZom8qXfrv9QYvuDrvd0PuvPmj6qCsjVUwZSNr60BI4,1242
97
97
  datachain/lib/udf.py,sha256=51qgPO5s5MA5ccwl7IIPxbkEZ4IKZe4tzihcpZ8ufX0,18618
@@ -105,7 +105,7 @@ datachain/lib/convert/flatten.py,sha256=_5rjGFnN6t1KCX5ftL5rG7tiiNat7j0SdNqajO15
105
105
  datachain/lib/convert/python_to_sql.py,sha256=wfnqJ2vRL5UydNPQHshd82hUONsDBa4XyobCSTGqcEo,3187
106
106
  datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
107
107
  datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
108
- datachain/lib/convert/values_to_tuples.py,sha256=Sxj0ojeMSpAwM_NNoXa1dMR_2L_cQ6Xw_bAaNkEoNhU,4342
108
+ datachain/lib/convert/values_to_tuples.py,sha256=nOn7dkzScYERZH-2vgUxkQawRQ1KgdIuSDIicvqZkc0,7171
109
109
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
110
110
  datachain/lib/dc/csv.py,sha256=fIfj5-2Ix4z5D5yZueagd5WUWw86pusJ9JJKD-U3KGg,4407
111
111
  datachain/lib/dc/database.py,sha256=Wqob3dQc9Mol_0vagzVEXzteCKS9M0E3U5130KVmQKg,14629
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=9HHZl0fGs5Zj8b9l2L3IKf0AiiVNL9SnWbc2rfDiXRA,3710
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.37.8.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.37.8.dist-info/METADATA,sha256=6MLsgOSmSsxKXzbiOqTs9yQXaPhFu1QwgSqN_OmuQQM,13763
170
- datachain-0.37.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.37.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.37.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.37.8.dist-info/RECORD,,
168
+ datachain-0.37.9.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.37.9.dist-info/METADATA,sha256=iZmFzvJMHOE2j4t9zGX2eliujOaRIcD0E39Cx1IXSXg,13763
170
+ datachain-0.37.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.37.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.37.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.37.9.dist-info/RECORD,,