datachain 0.34.3__py3-none-any.whl → 0.34.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/signal_schema.py CHANGED
@@ -1,6 +1,8 @@
1
1
  import copy
2
2
  import hashlib
3
3
  import json
4
+ import logging
5
+ import math
4
6
  import warnings
5
7
  from collections.abc import Iterator, Sequence
6
8
  from dataclasses import dataclass
@@ -23,7 +25,7 @@ from typing import ( # noqa: UP035
23
25
  get_origin,
24
26
  )
25
27
 
26
- from pydantic import BaseModel, Field, create_model
28
+ from pydantic import BaseModel, Field, ValidationError, create_model
27
29
  from sqlalchemy import ColumnElement
28
30
  from typing_extensions import Literal as LiteralEx
29
31
 
@@ -43,6 +45,8 @@ if TYPE_CHECKING:
43
45
  from datachain.catalog import Catalog
44
46
 
45
47
 
48
+ logger = logging.getLogger(__name__)
49
+
46
50
  NAMES_TO_TYPES = {
47
51
  "int": int,
48
52
  "str": str,
@@ -463,12 +467,32 @@ class SignalSchema:
463
467
  objs.append(self.setup_values.get(name))
464
468
  elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
465
469
  j, pos = unflatten_to_json_pos(fr, row, pos)
466
- objs.append(fr(**j))
470
+ try:
471
+ obj = fr(**j)
472
+ except ValidationError as e:
473
+ if self._all_values_none(j):
474
+ logger.debug("Failed to create input for %s: %s", name, e)
475
+ obj = None
476
+ else:
477
+ raise
478
+ objs.append(obj)
467
479
  else:
468
480
  objs.append(row[pos])
469
481
  pos += 1
470
482
  return objs
471
483
 
484
+ @staticmethod
485
+ def _all_values_none(value: Any) -> bool:
486
+ if isinstance(value, dict):
487
+ return all(SignalSchema._all_values_none(v) for v in value.values())
488
+ if isinstance(value, (list, tuple, set)):
489
+ return all(SignalSchema._all_values_none(v) for v in value)
490
+ if isinstance(value, float):
491
+ # NaN is used to represent NULL and NaN float values in datachain
492
+ # Since SQLite does not have a separate NULL type, we need to check for NaN
493
+ return math.isnan(value) or value is None
494
+ return value is None
495
+
472
496
  def get_file_signal(self) -> Optional[str]:
473
497
  for signal_name, signal_type in self.values.items():
474
498
  if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass(
@@ -546,8 +570,15 @@ class SignalSchema:
546
570
  pos += 1
547
571
  else:
548
572
  json, pos = unflatten_to_json_pos(fr, row, pos) # type: ignore[union-attr]
549
- obj = fr(**json)
550
- SignalSchema._set_file_stream(obj, catalog, cache)
573
+ try:
574
+ obj = fr(**json)
575
+ SignalSchema._set_file_stream(obj, catalog, cache)
576
+ except ValidationError as e:
577
+ if self._all_values_none(json):
578
+ logger.debug("Failed to create feature for %s: %s", fr_cls, e)
579
+ obj = None
580
+ else:
581
+ raise
551
582
  res.append(obj)
552
583
  return res
553
584
 
datachain/lib/udf.py CHANGED
@@ -160,9 +160,15 @@ class UDFBase(AbstractUDF):
160
160
  """
161
161
  Creates SHA hash of this UDF function. It takes into account function,
162
162
  inputs and outputs.
163
+
164
+ For function-based UDFs, hashes self._func.
165
+ For class-based UDFs, hashes the process method.
163
166
  """
167
+ # Hash user code: either _func (function-based) or process method (class-based)
168
+ func_to_hash = self._func if self._func else self.process
169
+
164
170
  parts = [
165
- hash_callable(self._func),
171
+ hash_callable(func_to_hash),
166
172
  self.params.hash() if self.params else "",
167
173
  self.output.hash(),
168
174
  ]
datachain/query/dataset.py CHANGED
@@ -982,18 +982,26 @@ class SQLUnion(Step):
982
982
 
983
983
  columns1, columns2 = _order_columns(q1.columns, q2.columns)
984
984
 
985
+ union_select = sqlalchemy.select(*columns1).union_all(
986
+ sqlalchemy.select(*columns2)
987
+ )
988
+ union_cte = union_select.cte()
989
+ regenerated = self.query1.catalog.warehouse._regenerate_system_columns(
990
+ union_cte
991
+ )
992
+ result_columns = tuple(regenerated.selected_columns)
993
+
985
994
  def q(*columns):
986
- names = {c.name for c in columns}
987
- col1 = [c for c in columns1 if c.name in names]
988
- col2 = [c for c in columns2 if c.name in names]
989
- res = sqlalchemy.select(*col1).union_all(sqlalchemy.select(*col2))
995
+ if not columns:
996
+ return regenerated
990
997
 
991
- subquery = res.subquery()
992
- return sqlalchemy.select(*subquery.c).select_from(subquery)
998
+ names = {c.name for c in columns}
999
+ selected = [c for c in result_columns if c.name in names]
1000
+ return regenerated.with_only_columns(*selected)
993
1001
 
994
1002
  return step_result(
995
1003
  q,
996
- columns1,
1004
+ result_columns,
997
1005
  dependencies=self.query1.dependencies | self.query2.dependencies,
998
1006
  )
999
1007
 
@@ -1009,7 +1017,9 @@ class SQLJoin(Step):
1009
1017
  rname: str
1010
1018
 
1011
1019
  def hash_inputs(self) -> str:
1012
- predicates = ensure_sequence(self.predicates or [])
1020
+ predicates = (
1021
+ ensure_sequence(self.predicates) if self.predicates is not None else []
1022
+ )
1013
1023
 
1014
1024
  parts = [
1015
1025
  bytes.fromhex(self.query1.hash()),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.3
3
+ Version: 0.34.5
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -90,10 +90,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
90
90
  datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
91
91
  datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
92
92
  datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
93
- datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
93
+ datachain/lib/signal_schema.py,sha256=NsL2ISnSRN-lKRpXzB9CtsUj2tVKcoAe73TaaZKMT-0,40774
94
94
  datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
95
95
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
96
- datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
96
+ datachain/lib/udf.py,sha256=bU_7xj6Mz4SsajFJ6tmrZm9Ygoi1ESoOyh1Q00W2zX4,18389
97
97
  datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
98
98
  datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
99
99
  datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -131,7 +131,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
131
131
  datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
132
132
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
133
133
  datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
134
- datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
134
+ datachain/query/dataset.py,sha256=I55ubMnoWpjoc4Ntw8zbp-i-49w0I95J7hCk_OCU6IU,68110
135
135
  datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
136
136
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
137
137
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.34.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.34.3.dist-info/METADATA,sha256=l1d5np6lvB4K8ohVibIbhzlNobGtlglmBhK0VcQqV-U,13655
170
- datachain-0.34.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.34.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.34.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.34.3.dist-info/RECORD,,
168
+ datachain-0.34.5.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.34.5.dist-info/METADATA,sha256=aovavI1WF3QvHC23Vcp4HxUykcEFNypIK3LlKo_wrDA,13655
170
+ datachain-0.34.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.34.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.34.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.34.5.dist-info/RECORD,,