datachain 0.30.0__py3-none-any.whl → 0.30.2__py3-none-any.whl

This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; consult the package registry's advisory page for more details.

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union
7
7
  import sqlalchemy
8
8
 
9
9
  from datachain.query.schema import ColumnMeta
10
+ from datachain.utils import batched
10
11
 
11
12
  DEFAULT_DATABASE_BATCH_SIZE = 10_000
12
13
 
@@ -74,6 +75,7 @@ def to_database(
74
75
  *,
75
76
  batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
76
77
  on_conflict: Optional[str] = None,
78
+ conflict_columns: Optional[list[str]] = None,
77
79
  column_mapping: Optional[dict[str, Optional[str]]] = None,
78
80
  ) -> None:
79
81
  """
@@ -82,8 +84,6 @@ def to_database(
82
84
  This is the core implementation that handles the actual database operations.
83
85
  For user-facing documentation, see DataChain.to_database() method.
84
86
  """
85
- from datachain.utils import batched
86
-
87
87
  if on_conflict and on_conflict not in ("ignore", "update"):
88
88
  raise ValueError(
89
89
  f"on_conflict must be 'ignore' or 'update', got: {on_conflict}"
@@ -105,19 +105,26 @@ def to_database(
105
105
  metadata = sqlalchemy.MetaData()
106
106
  table = sqlalchemy.Table(table_name, metadata, *columns)
107
107
 
108
- # Check if table already exists to determine if we should clean up on error.
109
- inspector = sqlalchemy.inspect(conn)
110
- assert inspector # to satisfy mypy
111
- table_existed_before = table_name in inspector.get_table_names()
112
-
108
+ table_existed_before = False
113
109
  try:
114
- table.create(conn, checkfirst=True)
115
- rows_iter = chain._leaf_values()
116
- for batch in batched(rows_iter, batch_rows):
117
- _process_batch(
118
- conn, table, batch, on_conflict, column_indices_and_names
119
- )
120
- conn.commit()
110
+ with conn.begin():
111
+ # Check if table exists to determine if we should clean up on error.
112
+ inspector = sqlalchemy.inspect(conn)
113
+ assert inspector # to satisfy mypy
114
+ table_existed_before = table_name in inspector.get_table_names()
115
+
116
+ table.create(conn, checkfirst=True)
117
+
118
+ rows_iter = chain._leaf_values()
119
+ for batch in batched(rows_iter, batch_rows):
120
+ _process_batch(
121
+ conn,
122
+ table,
123
+ batch,
124
+ on_conflict,
125
+ conflict_columns,
126
+ column_indices_and_names,
127
+ )
121
128
  except Exception:
122
129
  if not table_existed_before:
123
130
  try:
@@ -183,7 +190,9 @@ def _prepare_columns(all_columns, column_mapping):
183
190
  return column_indices_and_names, columns
184
191
 
185
192
 
186
- def _process_batch(conn, table, batch, on_conflict, column_indices_and_names):
193
+ def _process_batch(
194
+ conn, table, batch, on_conflict, conflict_columns, column_indices_and_names
195
+ ):
187
196
  """Process a batch of rows with conflict resolution."""
188
197
 
189
198
  def prepare_row(row_values):
@@ -217,7 +226,19 @@ def _process_batch(conn, table, batch, on_conflict, column_indices_and_names):
217
226
  update_values = {
218
227
  col.name: insert_stmt.excluded[col.name] for col in table.columns
219
228
  }
220
- insert_stmt = insert_stmt.on_conflict_do_update(set_=update_values)
229
+ if conn.engine.name == "postgresql":
230
+ if not conflict_columns:
231
+ raise ValueError(
232
+ "conflict_columns parameter is required when "
233
+ "on_conflict='update' with PostgreSQL. Specify the column "
234
+ "names that form a unique constraint."
235
+ )
236
+
237
+ insert_stmt = insert_stmt.on_conflict_do_update(
238
+ index_elements=conflict_columns, set_=update_values
239
+ )
240
+ else:
241
+ insert_stmt = insert_stmt.on_conflict_do_update(set_=update_values)
221
242
  elif on_conflict:
222
243
  import warnings
223
244
 
@@ -2296,6 +2296,7 @@ class DataChain:
2296
2296
  *,
2297
2297
  batch_rows: int = DEFAULT_DATABASE_BATCH_SIZE,
2298
2298
  on_conflict: Optional[str] = None,
2299
+ conflict_columns: Optional[list[str]] = None,
2299
2300
  column_mapping: Optional[dict[str, Optional[str]]] = None,
2300
2301
  ) -> None:
2301
2302
  """Save chain to a database table using a given database connection.
@@ -2319,6 +2320,9 @@ class DataChain:
2319
2320
  (default)
2320
2321
  - "ignore": Skip duplicate rows silently
2321
2322
  - "update": Update existing rows with new values
2323
+ conflict_columns: List of column names that form a unique constraint
2324
+ for conflict resolution. Required when on_conflict='update' and
2325
+ using PostgreSQL.
2322
2326
  column_mapping: Optional mapping to rename or skip columns:
2323
2327
  - Dict mapping DataChain column names to database column names
2324
2328
  - Set values to None to skip columns entirely, or use `defaultdict` to
@@ -2377,6 +2381,7 @@ class DataChain:
2377
2381
  connection,
2378
2382
  batch_rows=batch_rows,
2379
2383
  on_conflict=on_conflict,
2384
+ conflict_columns=conflict_columns,
2380
2385
  column_mapping=column_mapping,
2381
2386
  )
2382
2387
 
@@ -665,7 +665,7 @@ class UDFSignal(UDFStep):
665
665
  original_cols = [c for c in subq.c if c.name not in partition_col_names]
666
666
 
667
667
  # new signal columns that are added to udf_table
668
- signal_cols = [c for c in udf_table.c if c.name != "sys__id"]
668
+ signal_cols = [c for c in udf_table.c if not c.name.startswith("sys__")]
669
669
  signal_name_cols = {c.name: c for c in signal_cols}
670
670
  cols = signal_cols
671
671
 
datachain/sql/__init__.py CHANGED
@@ -1,6 +1,8 @@
1
1
  from sqlalchemy.sql.elements import literal
2
2
  from sqlalchemy.sql.expression import column
3
3
 
4
+ # Import PostgreSQL dialect registration (registers PostgreSQL type converter)
5
+ from . import postgresql_dialect # noqa: F401
4
6
  from .default import setup as default_setup
5
7
  from .selectable import select, values
6
8
 
@@ -0,0 +1,9 @@
1
+ """
2
+ PostgreSQL dialect registration for DataChain.
3
+ """
4
+
5
+ from datachain.sql.postgresql_types import PostgreSQLTypeConverter
6
+ from datachain.sql.types import register_backend_types
7
+
8
+ # Register PostgreSQL type converter
9
+ register_backend_types("postgresql", PostgreSQLTypeConverter())
@@ -0,0 +1,21 @@
1
+ """
2
+ PostgreSQL-specific type converter for DataChain.
3
+
4
+ Handles PostgreSQL-specific type mappings that differ from the default dialect.
5
+ """
6
+
7
+ from sqlalchemy.dialects import postgresql
8
+
9
+ from datachain.sql.types import TypeConverter
10
+
11
+
12
+ class PostgreSQLTypeConverter(TypeConverter):
13
+ """PostgreSQL-specific type converter."""
14
+
15
+ def datetime(self):
16
+ """PostgreSQL uses TIMESTAMP WITH TIME ZONE to preserve timezone information."""
17
+ return postgresql.TIMESTAMP(timezone=True)
18
+
19
+ def json(self):
20
+ """PostgreSQL uses JSONB for better performance and query capabilities."""
21
+ return postgresql.JSONB()
@@ -1,4 +1,8 @@
1
- from .base import create_user_defined_sql_functions, setup, sqlite_dialect
1
+ from .base import (
2
+ create_user_defined_sql_functions,
3
+ setup,
4
+ sqlite_dialect,
5
+ )
2
6
 
3
7
  __all__ = [
4
8
  "create_user_defined_sql_functions",
@@ -304,7 +304,11 @@ def register_user_defined_sql_functions() -> None:
304
304
 
305
305
 
306
306
  def adapt_datetime(val: datetime) -> str:
307
- if not (val.tzinfo is timezone.utc or val.tzname() == "UTC"):
307
+ is_utc_check = val.tzinfo is timezone.utc
308
+ tzname_check = val.tzname() == "UTC"
309
+ combined_check = is_utc_check or tzname_check
310
+
311
+ if not combined_check:
308
312
  try:
309
313
  val = val.astimezone(timezone.utc)
310
314
  except (OverflowError, ValueError, OSError):
@@ -314,6 +318,7 @@ def adapt_datetime(val: datetime) -> str:
314
318
  val = datetime.min.replace(tzinfo=timezone.utc)
315
319
  else:
316
320
  raise
321
+
317
322
  return val.replace(tzinfo=None).isoformat(" ")
318
323
 
319
324
 
datachain/sql/types.py CHANGED
@@ -58,9 +58,14 @@ def converter(dialect) -> "TypeConverter":
58
58
  try:
59
59
  return registry[name]
60
60
  except KeyError:
61
- raise ValueError(
62
- f"No type converter registered for dialect: {dialect.name!r}"
63
- ) from None
61
+ # Fall back to default converter if specific dialect not found
62
+ try:
63
+ return registry["default"]
64
+ except KeyError:
65
+ raise ValueError(
66
+ f"No type converter registered for dialect: {dialect.name!r} "
67
+ f"and no default converter available"
68
+ ) from None
64
69
 
65
70
 
66
71
  def read_converter(dialect) -> "TypeReadConverter":
@@ -68,9 +73,14 @@ def read_converter(dialect) -> "TypeReadConverter":
68
73
  try:
69
74
  return read_converter_registry[name]
70
75
  except KeyError:
71
- raise ValueError(
72
- f"No read type converter registered for dialect: {dialect.name!r}"
73
- ) from None
76
+ # Fall back to default converter if specific dialect not found
77
+ try:
78
+ return read_converter_registry["default"]
79
+ except KeyError:
80
+ raise ValueError(
81
+ f"No read type converter registered for dialect: {dialect.name!r} "
82
+ f"and no default converter available"
83
+ ) from None
74
84
 
75
85
 
76
86
  def type_defaults(dialect) -> "TypeDefaults":
@@ -78,7 +88,14 @@ def type_defaults(dialect) -> "TypeDefaults":
78
88
  try:
79
89
  return type_defaults_registry[name]
80
90
  except KeyError:
81
- raise ValueError(f"No type defaults registered for dialect: {name!r}") from None
91
+ # Fall back to default converter if specific dialect not found
92
+ try:
93
+ return type_defaults_registry["default"]
94
+ except KeyError:
95
+ raise ValueError(
96
+ f"No type defaults registered for dialect: {dialect.name!r} "
97
+ f"and no default converter available"
98
+ ) from None
82
99
 
83
100
 
84
101
  def db_defaults(dialect) -> "DBDefaults":
@@ -86,7 +103,14 @@ def db_defaults(dialect) -> "DBDefaults":
86
103
  try:
87
104
  return db_defaults_registry[name]
88
105
  except KeyError:
89
- raise ValueError(f"No DB defaults registered for dialect: {name!r}") from None
106
+ # Fall back to default converter if specific dialect not found
107
+ try:
108
+ return db_defaults_registry["default"]
109
+ except KeyError:
110
+ raise ValueError(
111
+ f"No DB defaults registered for dialect: {dialect.name!r} "
112
+ f"and no default converter available"
113
+ ) from None
90
114
 
91
115
 
92
116
  class SQLType(TypeDecorator):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.30.0
3
+ Version: 0.30.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -81,8 +81,10 @@ Provides-Extra: video
81
81
  Requires-Dist: ffmpeg-python; extra == "video"
82
82
  Requires-Dist: imageio[ffmpeg,pyav]>=2.37.0; extra == "video"
83
83
  Requires-Dist: opencv-python; extra == "video"
84
+ Provides-Extra: postgres
85
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "postgres"
84
86
  Provides-Extra: tests
85
- Requires-Dist: datachain[audio,hf,remote,torch,vector,video]; extra == "tests"
87
+ Requires-Dist: datachain[audio,hf,postgres,remote,torch,vector,video]; extra == "tests"
86
88
  Requires-Dist: pytest<9,>=8; extra == "tests"
87
89
  Requires-Dist: pytest-sugar>=0.9.6; extra == "tests"
88
90
  Requires-Dist: pytest-cov>=4.1.0; extra == "tests"
@@ -90,6 +92,7 @@ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
90
92
  Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
91
93
  Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
92
94
  Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
95
+ Requires-Dist: pytest-env>=1.1.0; extra == "tests"
93
96
  Requires-Dist: virtualenv; extra == "tests"
94
97
  Requires-Dist: dulwich; extra == "tests"
95
98
  Requires-Dist: hypothesis; extra == "tests"
@@ -103,8 +103,8 @@ datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sD
103
103
  datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
104
104
  datachain/lib/dc/__init__.py,sha256=TFci5HTvYGjBesNUxDAnXaX36PnzPEUSn5a6JxB9o0U,872
105
105
  datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
106
- datachain/lib/dc/database.py,sha256=MPE-KzwcR2DhWLCEbl1gWFp63dLqjWuiJ1iEfC2BrJI,12443
107
- datachain/lib/dc/datachain.py,sha256=_C9PZjUHVewpdp94AR2GS3QEI96Svsyx52dLJVM4tm4,98143
106
+ datachain/lib/dc/database.py,sha256=4Fhen6KZRMYzSfONydwTFwSUECbdff0t-9GSj6ADyYM,13288
107
+ datachain/lib/dc/datachain.py,sha256=UHICzncqG6GmDXxrX0DEYVXJK19c-8H-eoRuA7097zc,98439
108
108
  datachain/lib/dc/datasets.py,sha256=P6CIJizD2IYFwOQG5D3VbQRjDmUiRH0ysdtb551Xdm8,15098
109
109
  datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
110
110
  datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
@@ -126,7 +126,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
126
126
  datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
127
127
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
128
128
  datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
129
- datachain/query/dataset.py,sha256=OJZ_YwpS5i4B0wVmosMmMNW1qABr6zyOmqNHQdAWir4,62704
129
+ datachain/query/dataset.py,sha256=hUKKHuqkfNaojzOt6rMIksU-PG72i_lfbKSHZ5rt--M,62715
130
130
  datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
131
131
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
132
132
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -137,9 +137,11 @@ datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
137
137
  datachain/query/utils.py,sha256=a2PTBZ3qsG6XlUcp9XsoGiQfKkca4Q3m-VzFgiGQPAc,1230
138
138
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
139
  datachain/remote/studio.py,sha256=pDThxvEEpIKVGfa9rmtz_zeqHwrgzh0Lv-Pd4wzDx5k,15448
140
- datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
140
+ datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
141
+ datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
142
+ datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
141
143
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
142
- datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
144
+ datachain/sql/types.py,sha256=RWOghtYFx14K-e71QOGg5yfKb-A4-4JgFjaJ0wCZ17Y,15006
143
145
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
144
146
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
145
147
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -151,16 +153,16 @@ datachain/sql/functions/numeric.py,sha256=BK2KCiPSgM2IveCq-9M_PG3CtPBlztaS9TTn1L
151
153
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
152
154
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
153
155
  datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
154
- datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
155
- datachain/sql/sqlite/base.py,sha256=ldyWoMBvrYH789GK8xE4wbBNYxz4B-g4BmQAK6_KjR0,21398
156
+ datachain/sql/sqlite/__init__.py,sha256=PsLaDSij9a03VxGSpagpNl7NQsGtgm72ArUeALZONoc,183
157
+ datachain/sql/sqlite/base.py,sha256=6aoQHeggY3hs31_YZ-wlYKA1Lto4MFOpgfgRspH6IMc,21498
156
158
  datachain/sql/sqlite/types.py,sha256=cH6oge2E_YWFy22wY-txPJH8gxoQFSpCthtZR8PZjpo,1849
157
159
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
158
160
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
159
161
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
160
162
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
161
- datachain-0.30.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
162
- datachain-0.30.0.dist-info/METADATA,sha256=khld2YGTkc6UpViqw2RxeeDeUwG8wGW6X2RR2zBaydk,13766
163
- datachain-0.30.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
164
- datachain-0.30.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
165
- datachain-0.30.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
166
- datachain-0.30.0.dist-info/RECORD,,
163
+ datachain-0.30.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
164
+ datachain-0.30.2.dist-info/METADATA,sha256=nLnOc_mmzRuLWdFk3hiUi_P71TLY975X_ZWk4iyojeg,13910
165
+ datachain-0.30.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
166
+ datachain-0.30.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
167
+ datachain-0.30.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
168
+ datachain-0.30.2.dist-info/RECORD,,