datachain 0.37.1__py3-none-any.whl → 0.37.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/data_storage/sqlite.py CHANGED
@@ -29,7 +29,6 @@ from sqlalchemy.sql.selectable import Select
29
29
  from tqdm.auto import tqdm
30
30
 
31
31
  import datachain.sql.sqlite
32
- from datachain import semver
33
32
  from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
34
33
  from datachain.data_storage.db_engine import DatabaseEngine
35
34
  from datachain.data_storage.schema import DefaultSchema
@@ -692,61 +691,6 @@ class SQLiteWarehouse(AbstractWarehouse):
692
691
  for row in self.db.execute(query, cursor=cur)
693
692
  ]
694
693
 
695
- def merge_dataset_rows(
696
- self,
697
- src: DatasetRecord,
698
- dst: DatasetRecord,
699
- src_version: str,
700
- dst_version: str,
701
- ) -> None:
702
- dst_empty = False
703
-
704
- if not self.db.has_table(self.dataset_table_name(src, src_version)):
705
- # source table doesn't exist, nothing to do
706
- return
707
-
708
- src_dr = self.dataset_rows(src, src_version).table
709
-
710
- if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
711
- # destination table doesn't exist, create it
712
- self.create_dataset_rows_table(
713
- self.dataset_table_name(dst, dst_version),
714
- columns=src_dr.columns,
715
- )
716
- dst_empty = True
717
-
718
- dst_dr = self.dataset_rows(dst, dst_version).table
719
- merge_fields = [c.name for c in src_dr.columns if c.name != "sys__id"]
720
- select_src = select(*(getattr(src_dr.columns, f) for f in merge_fields))
721
-
722
- if dst_empty:
723
- # we don't need union, but just select from source to destination
724
- insert_query = sqlite.insert(dst_dr).from_select(merge_fields, select_src)
725
- else:
726
- dst_version_latest = None
727
- # find the previous version of the destination dataset
728
- dst_previous_versions = [
729
- v.version
730
- for v in dst.versions # type: ignore [union-attr]
731
- if semver.compare(v.version, dst_version) == -1
732
- ]
733
- if dst_previous_versions:
734
- dst_version_latest = max(dst_previous_versions)
735
-
736
- dst_dr_latest = self.dataset_rows(dst, dst_version_latest).table
737
-
738
- select_dst_latest = select(
739
- *(getattr(dst_dr_latest.c, f) for f in merge_fields)
740
- )
741
- union_query = sqlalchemy.union(select_src, select_dst_latest)
742
- insert_query = (
743
- sqlite.insert(dst_dr)
744
- .from_select(merge_fields, union_query)
745
- .prefix_with("OR IGNORE")
746
- )
747
-
748
- self.db.execute(insert_query)
749
-
750
694
  def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
751
695
  return (e.model_dump() for e in entries)
752
696
 
datachain/data_storage/warehouse.py CHANGED
@@ -371,21 +371,6 @@ class AbstractWarehouse(ABC, Serializable):
371
371
  table = sa.Table(table_name, self.db.metadata)
372
372
  self.db.drop_table(table, if_exists=if_exists)
373
373
 
374
- @abstractmethod
375
- def merge_dataset_rows(
376
- self,
377
- src: "DatasetRecord",
378
- dst: "DatasetRecord",
379
- src_version: str,
380
- dst_version: str,
381
- ) -> None:
382
- """
383
- Merges source dataset rows and current latest destination dataset rows
384
- into a new rows table created for new destination dataset version.
385
- Note that table for new destination version must be created upfront.
386
- Merge results should not contain duplicates.
387
- """
388
-
389
374
  def dataset_rows_select(
390
375
  self,
391
376
  query: sa.Select,
datachain/delta.py CHANGED
@@ -1,16 +1,12 @@
1
- import hashlib
2
1
  from collections.abc import Sequence
3
2
  from copy import copy
4
3
  from functools import wraps
5
4
  from typing import TYPE_CHECKING, TypeVar
6
5
 
7
- from attrs import frozen
8
-
9
6
  import datachain
10
7
  from datachain.dataset import DatasetDependency, DatasetRecord
11
8
  from datachain.error import DatasetNotFoundError
12
9
  from datachain.project import Project
13
- from datachain.query.dataset import Step, step_result
14
10
 
15
11
  if TYPE_CHECKING:
16
12
  from collections.abc import Callable
@@ -18,9 +14,7 @@ if TYPE_CHECKING:
18
14
 
19
15
  from typing_extensions import ParamSpec
20
16
 
21
- from datachain.catalog import Catalog
22
17
  from datachain.lib.dc import DataChain
23
- from datachain.query.dataset import QueryGenerator
24
18
 
25
19
  P = ParamSpec("P")
26
20
 
@@ -49,38 +43,11 @@ def delta_disabled(
49
43
  return _inner
50
44
 
51
45
 
52
- @frozen
53
- class _RegenerateSystemColumnsStep(Step):
54
- catalog: "Catalog"
55
-
56
- def hash_inputs(self) -> str:
57
- return hashlib.sha256(b"regenerate_sys_columns").hexdigest()
58
-
59
- def apply(self, query_generator: "QueryGenerator", temp_tables: list[str]):
60
- selectable = query_generator.select()
61
- regenerated = self.catalog.warehouse._regenerate_system_columns(
62
- selectable,
63
- keep_existing_columns=True,
64
- regenerate_columns=None,
65
- )
66
-
67
- def q(*columns):
68
- return regenerated.with_only_columns(*columns)
69
-
70
- return step_result(q, regenerated.selected_columns)
71
-
72
-
73
46
  def _append_steps(dc: "DataChain", other: "DataChain"):
74
47
  """Returns cloned chain with appended steps from other chain.
75
48
  Steps are all those modification methods applied like filters, mappers etc.
76
49
  """
77
50
  dc = dc.clone()
78
- dc._query.steps.append(
79
- _RegenerateSystemColumnsStep(
80
- catalog=dc.session.catalog,
81
- )
82
- )
83
-
84
51
  dc._query.steps += other._query.steps.copy()
85
52
  dc.signals_schema = other.signals_schema
86
53
  return dc
@@ -150,7 +117,9 @@ def _get_retry_chain(
150
117
  error_records = result_dataset.filter(C(delta_retry) != "")
151
118
  error_source_records = source_dc.merge(
152
119
  error_records, on=on, right_on=right_on, inner=True
153
- ).select(*list(source_dc.signals_schema.values))
120
+ ).select(
121
+ *list(source_dc.signals_schema.clone_without_sys_signals().values.keys())
122
+ )
154
123
  retry_chain = error_source_records
155
124
 
156
125
  # Handle missing records if delta_retry is True
datachain/lib/dc/datachain.py CHANGED
@@ -1697,14 +1697,13 @@ class DataChain:
1697
1697
  query.feature_schema = None
1698
1698
  ds = self._evolve(query=query)
1699
1699
 
1700
+ # Note: merge drops sys signals from both sides, make sure to not include it
1701
+ # in the resulting schema
1700
1702
  signals_schema = self.signals_schema.clone_without_sys_signals()
1701
1703
  right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
1702
1704
 
1703
1705
  ds.signals_schema = signals_schema.merge(right_signals_schema, rname)
1704
1706
 
1705
- if not full:
1706
- ds.signals_schema = SignalSchema({"sys": Sys}) | ds.signals_schema
1707
-
1708
1707
  return ds
1709
1708
 
1710
1709
  @delta_disabled
datachain/lib/dc/datasets.py CHANGED
@@ -200,6 +200,10 @@ def read_dataset(
200
200
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
201
201
  else:
202
202
  signals_schema |= SignalSchema.from_column_types(query.column_types or {})
203
+
204
+ if delta:
205
+ signals_schema = signals_schema.clone_without_sys_signals()
206
+
203
207
  chain = DataChain(query, _settings, signals_schema)
204
208
 
205
209
  if delta:
datachain/lib/dc/storage.py CHANGED
@@ -187,6 +187,12 @@ def read_storage(
187
187
  project=listing_project_name,
188
188
  session=session,
189
189
  settings=settings,
190
+ delta=delta,
191
+ delta_on=delta_on,
192
+ delta_result_on=delta_result_on,
193
+ delta_compare=delta_compare,
194
+ delta_retry=delta_retry,
195
+ delta_unsafe=delta_unsafe,
190
196
  )
191
197
  dc._query.update = update
192
198
  dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
@@ -252,13 +258,4 @@ def read_storage(
252
258
 
253
259
  assert storage_chain is not None
254
260
 
255
- if delta:
256
- storage_chain = storage_chain._as_delta(
257
- on=delta_on,
258
- right_on=delta_result_on,
259
- compare=delta_compare,
260
- delta_retry=delta_retry,
261
- delta_unsafe=delta_unsafe,
262
- )
263
-
264
261
  return storage_chain
datachain/query/dataset.py CHANGED
@@ -1065,7 +1065,7 @@ class SQLJoin(Step):
1065
1065
  q1 = self.get_query(self.query1, temp_tables)
1066
1066
  q2 = self.get_query(self.query2, temp_tables)
1067
1067
 
1068
- q1_columns = _drop_system_columns(q1.c) if self.full else list(q1.c)
1068
+ q1_columns = _drop_system_columns(q1.c)
1069
1069
  q1_column_names = {c.name for c in q1_columns}
1070
1070
 
1071
1071
  q2_columns = []
datachain-0.37.3.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.37.1
3
+ Version: 0.37.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
datachain-0.37.3.dist-info/RECORD CHANGED
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=Klkc7iL_KvryeZk-UNjtByTFk7URbpb60XblalqHoYI,3604
5
5
  datachain/checkpoint.py,sha256=AOMqN_2fNuEBJDAsmc-P4L7FU444eQxTU4MCgr-XEH8,1121
6
6
  datachain/config.py,sha256=KPXef6P4NAZiEbSDMUcFwuNVTul2fZBs5xrCbyRl6Tg,4193
7
7
  datachain/dataset.py,sha256=PQwgeFPmEyN8xucaU41q371VJ1EAFXdMVbeQOVeCPFQ,24995
8
- datachain/delta.py,sha256=eUQK_zUH5xMwe0dNUaDsaovsAl97ULGtlG90uVuPaLY,11605
8
+ datachain/delta.py,sha256=8Bj7v4NQbH2ufNwAm1wYWrA-7vGJuLFVd4Mie1mowQs,10711
9
9
  datachain/error.py,sha256=P_5KXlfVIsW4E42JJCoFhGsgvY8la-6jXBEWbHbgqKo,1846
10
10
  datachain/hash_utils.py,sha256=FHzZS8WC4Qr_e-kZeQlfl-ilZ78IXWxj-xMZOqm8Ies,4455
11
11
  datachain/job.py,sha256=vCcHJHKRo5uZTpmUYt_1oVkeawFF0x8jbnm-XZYaKfI,1358
@@ -57,8 +57,8 @@ datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE
57
57
  datachain/data_storage/metastore.py,sha256=DFyTkKLJN5-nFXXc7ln_rGj-FLctj0nrhXJxuyprZSk,64661
58
58
  datachain/data_storage/schema.py,sha256=3fAgiE11TIDYCW7EbTdiOm61SErRitvsLr7YPnUlVm0,9801
59
59
  datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
60
- datachain/data_storage/sqlite.py,sha256=pee99RewNQh5kVxGpD2sf9V5VloM4xwn8oeEhquU1rs,31756
61
- datachain/data_storage/warehouse.py,sha256=nuGT27visvAi7jr7ZAZF-wmFe0ZEFD8qaTheINX_7RM,35269
60
+ datachain/data_storage/sqlite.py,sha256=o9TR6N27JB52M9rRXdM9uwdBektGucWtJi9UnmLGh0A,29669
61
+ datachain/data_storage/warehouse.py,sha256=Zhf_HzhiEpsI0IuinAK-sF4ZMH66rV_ZDSOx-UFHv5o,34771
62
62
  datachain/diff/__init__.py,sha256=lGrygGzdWSSYJ1DgX4h2q_ko5QINEW8PKfxOwE9ZFnI,9394
63
63
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -109,15 +109,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=Sxj0ojeMSpAwM_NNoXa1dMR_2L_cQ6X
109
109
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
110
110
  datachain/lib/dc/csv.py,sha256=fIfj5-2Ix4z5D5yZueagd5WUWw86pusJ9JJKD-U3KGg,4407
111
111
  datachain/lib/dc/database.py,sha256=Wqob3dQc9Mol_0vagzVEXzteCKS9M0E3U5130KVmQKg,14629
112
- datachain/lib/dc/datachain.py,sha256=9zEL36hVkDxPmmy1A8dv9CFADUEnDr3S7vXNxbSHpGE,104054
113
- datachain/lib/dc/datasets.py,sha256=A4SW-b3dkQnm9Wi7ciCdlXqtrsquIeRfBQN_bJ_ulqY,15237
112
+ datachain/lib/dc/datachain.py,sha256=RYhinLQ6CMU3tudLpiJGh-vfCL24KDKbKM3Q1EsWoAE,104072
113
+ datachain/lib/dc/datasets.py,sha256=oY1t8QBAaZdhjwR439zZT74hMOspewVCrgdwy6juXng,15321
114
114
  datachain/lib/dc/hf.py,sha256=FeruEO176L2qQ1Mnx0QmK4kV0GuQ4xtj717N8fGJrBI,2849
115
115
  datachain/lib/dc/json.py,sha256=iJ6G0jwTKz8xtfh1eICShnWk_bAMWjF5bFnOXLHaTlw,2683
116
116
  datachain/lib/dc/listings.py,sha256=0XTZERQZ2ErP3LSVg9lF9i3alKebqA1Kip2Zf15unUM,4507
117
117
  datachain/lib/dc/pandas.py,sha256=o9rTcZf27-3mCEaDdX1ZzM0I4bSOsu-4mA2zK6rWoS4,1460
118
118
  datachain/lib/dc/parquet.py,sha256=wa_VazXotY5RZ8ypC0_M9Qo30tamzXmYeVE6P-NcQ1Y,2375
119
119
  datachain/lib/dc/records.py,sha256=WvbaLhMqM9e54gJLLeG54QX5ZXkkBIK3FokojLTSbZc,2974
120
- datachain/lib/dc/storage.py,sha256=nlEg-9v9iwtiQUcGd-Ng1lnrpMviliR95mjceBez1BU,9767
120
+ datachain/lib/dc/storage.py,sha256=zfVMkYqwmhI4bnOqyO6bW5gg_DfdYPM7ltWLTHDjGZo,9737
121
121
  datachain/lib/dc/storage_pattern.py,sha256=TqaDb5yq050W9IxpESz9iotjs0R__i5ngRtVo5BmJ-8,7645
122
122
  datachain/lib/dc/utils.py,sha256=kTpzS8CBQmle1A0XZzu4b5zAHo8piFiBSP1lS8ztkQU,4088
123
123
  datachain/lib/dc/values.py,sha256=-EI3xYUNzfwzogbW8WdHX0XbWev-je6_5-CnDsLRcF4,1399
@@ -132,7 +132,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
132
132
  datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
133
133
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
134
134
  datachain/query/batch.py,sha256=ugTlSFqh_kxMcG6vJ5XrEzG9jBXRdb7KRAEEsFWiPew,4190
135
- datachain/query/dataset.py,sha256=pHdanZoPsCM20IK0PDt5EXWcPro9W0C-U3OXtPzpMDE,67556
135
+ datachain/query/dataset.py,sha256=vYSbtpVaobIC7uCWdx-wgPOh9_T8oRQoVw1ZC_9ArKE,67527
136
136
  datachain/query/dispatch.py,sha256=Tg73zB6vDnYYYAvtlS9l7BI3sI1EfRCbDjiasvNxz2s,16385
137
137
  datachain/query/metrics.py,sha256=qOMHiYPTMtVs2zI-mUSy8OPAVwrg4oJtVF85B9tdQyM,810
138
138
  datachain/query/params.py,sha256=JkVz6IKUIpF58JZRkUXFT8DAHX2yfaULbhVaGmHKFLc,826
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=xQzzmvQRKsPteDKbpgOxd4r971BnFaK33mcOl0FuGeI,2883
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.37.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.37.1.dist-info/METADATA,sha256=Gxm5b2gZCiJZpi7L8_J0RM_YpxhgHueS0GqSSkWQWaA,13762
170
- datachain-0.37.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.37.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.37.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.37.1.dist-info/RECORD,,
168
+ datachain-0.37.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.37.3.dist-info/METADATA,sha256=4Z4RNRBEOLYmciwRH0KYyx67xMEqaeORoGJX0ywH9gM,13762
170
+ datachain-0.37.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.37.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.37.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.37.3.dist-info/RECORD,,