datachain 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED

@@ -120,13 +120,25 @@ def noop(_: str):
 
 @contextmanager
 def print_and_capture(
-    stream: "IO[str]", callback: Callable[[str], None] = noop
+    stream: "IO[bytes]|IO[str]", callback: Callable[[str], None] = noop
 ) -> "Iterator[list[str]]":
     lines: list[str] = []
     append = lines.append
 
     def loop() -> None:
-        for line in iter(stream.readline, ""):
+        buffer = b""
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt.encode("utf-8") if isinstance(byt, str) else byt
+
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8")
+                print(line, end="")
+                callback(line)
+                append(line)
+                buffer = b""  # Clear buffer for next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8")
             print(line, end="")
             callback(line)
             append(line)
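The rewritten loop flushes on both `\n` and `\r`, so carriage-return progress output from a subprocess (tqdm-style updates) is forwarded as soon as it arrives instead of waiting for a final newline, which `readline()` would do. A minimal, self-contained sketch of the same technique (the function name and sample input are illustrative, not part of datachain):

```py
import io
from typing import IO, Callable


def capture_lines(stream: IO[bytes], callback: Callable[[str], None]) -> list[str]:
    """Illustrative re-implementation of the byte-wise loop above (bytes streams only)."""
    lines: list[str] = []
    buffer = b""
    while byt := stream.read(1):           # read one byte at a time
        buffer += byt
        if byt in (b"\n", b"\r"):          # flush on newline *or* carriage return
            line = buffer.decode("utf-8")
            callback(line)
            lines.append(line)
            buffer = b""
    if buffer:                             # trailing data without a final newline
        line = buffer.decode("utf-8")
        callback(line)
        lines.append(line)
    return lines


# "\r"-terminated progress updates are emitted immediately; readline() would
# not return them until it finally saw a "\n".
capture_lines(io.BytesIO(b"progress 50%\rprogress 100%\rdone\n"), print)
```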
@@ -2128,7 +2140,7 @@ class Catalog:
             stdout=subprocess.PIPE if capture_output else None,
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
-            text=True,
+            text=False,
             **kwargs,
         ) as proc:
             os.close(w)
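The query subprocess now opens its pipes in binary mode (`text=False`), matching the byte-oriented reader above. Roughly how the two changes fit together; the child command below is a placeholder, and the real call site inside Catalog is not shown in this hunk:

```py
import subprocess

# print_and_capture is the helper patched above; it yields the list of
# captured lines while forwarding them to stdout and the callback.
from datachain.catalog.catalog import print_and_capture

with subprocess.Popen(
    ["python", "-u", "worker_script.py"],  # placeholder command
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=False,  # bytes pipe, as in the change above
) as proc:
    assert proc.stdout is not None
    with print_and_capture(proc.stdout) as lines:
        proc.wait()
print(f"captured {len(lines)} lines")
```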
datachain/data_storage/schema.py CHANGED

@@ -50,7 +50,7 @@ def convert_rows_custom_column_types(
     columns: "ColumnCollection[str, ColumnElement[Any]]",
     rows: Iterator[tuple[Any, ...]],
     dialect: "Dialect",
-):
+) -> Iterator[tuple[Any, ...]]:
     """
     This function converts values of rows columns based on their types which are
     defined in columns. We are only converting column values for which types are
datachain/data_storage/sqlite.py CHANGED

@@ -27,10 +27,7 @@ import datachain.sql.sqlite
 from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
 from datachain.data_storage.db_engine import DatabaseEngine
 from datachain.data_storage.id_generator import AbstractDBIDGenerator
-from datachain.data_storage.schema import (
-    DefaultSchema,
-    convert_rows_custom_column_types,
-)
+from datachain.data_storage.schema import DefaultSchema
 from datachain.dataset import DatasetRecord
 from datachain.error import DataChainError
 from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
@@ -209,6 +206,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
             return cursor.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)
 
+    @retry_sqlite_locks
     def execute_str(self, sql: str, parameters=None) -> sqlite3.Cursor:
         if parameters is None:
             return self.db.execute(sql)
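`execute_str` now carries the same `retry_sqlite_locks` decorator as the other execute paths, so one-off SQL strings also retry when SQLite reports a locked database. The decorator's implementation is not part of this diff; a generic retry-on-lock decorator of this kind might look like the sketch below (name, attempt count, and backoff are assumptions, not datachain's actual code):

```py
import sqlite3
import time
from functools import wraps


def retry_on_locked(attempts: int = 5, delay: float = 0.05):
    """Illustrative only; NOT datachain's retry_sqlite_locks."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except sqlite3.OperationalError as exc:
                    # SQLite raises "database is locked" when another connection
                    # holds a conflicting lock; back off briefly and retry.
                    if "locked" not in str(exc) or attempt == attempts - 1:
                        raise
                    time.sleep(delay * (attempt + 1))
        return wrapper
    return decorator
```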
@@ -650,12 +648,6 @@ class SQLiteWarehouse(AbstractWarehouse):
         self.db.create_table(table, if_not_exists=if_not_exists)
         return table
 
-    def dataset_rows_select(self, select_query: Select, **kwargs):
-        rows = self.db.execute(select_query, **kwargs)
-        yield from convert_rows_custom_column_types(
-            select_query.selected_columns, rows, sqlite_dialect
-        )
-
     def get_dataset_sources(
         self, dataset: DatasetRecord, version: int
     ) -> list[StorageURI]:
datachain/data_storage/warehouse.py CHANGED

@@ -17,6 +17,7 @@ from sqlalchemy.sql.expression import true
 from tqdm import tqdm
 
 from datachain.client import Client
+from datachain.data_storage.schema import convert_rows_custom_column_types
 from datachain.data_storage.serializer import Serializable
 from datachain.dataset import DatasetRecord
 from datachain.node import DirType, DirTypeGroup, Entry, Node, NodeWithPath, get_path
@@ -226,7 +227,7 @@ class AbstractWarehouse(ABC, Serializable):
         if limit < page_size:
             paginated_query = paginated_query.limit(None).limit(limit)
 
-        results = self.db.execute(paginated_query.offset(offset))
+        results = self.dataset_rows_select(paginated_query.offset(offset))
 
         processed = False
         for row in results:
@@ -309,12 +310,18 @@ class AbstractWarehouse(ABC, Serializable):
         Merge results should not contain duplicates.
         """
 
-    @abstractmethod
-    def dataset_rows_select(self, select_query: sa.sql.selectable.Select, **kwargs):
+    def dataset_rows_select(
+        self,
+        query: sa.sql.selectable.Select,
+        **kwargs,
+    ) -> Iterator[tuple[Any, ...]]:
         """
-        Method for fetching dataset rows from database. This is abstract since
-        in some DBs we need to use special settings
+        Fetch dataset rows from database.
         """
+        rows = self.db.execute(query, **kwargs)
+        yield from convert_rows_custom_column_types(
+            query.selected_columns, rows, self.db.dialect
+        )
 
     @abstractmethod
     def get_dataset_sources(
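`dataset_rows_select` is no longer abstract: the base class now executes the query and converts custom column types using the warehouse's own dialect, which is why `SQLiteWarehouse` could drop its copy above. A backend that still needs special execution settings (the original reason the method was abstract) could override it and delegate back to the shared logic; the subclass below is a hypothetical sketch, not code from datachain:

```py
from collections.abc import Iterator
from typing import Any

import sqlalchemy as sa

from datachain.data_storage.warehouse import AbstractWarehouse


class StreamingWarehouse(AbstractWarehouse):  # hypothetical backend; other abstract methods omitted
    def dataset_rows_select(
        self, query: sa.sql.selectable.Select, **kwargs
    ) -> Iterator[tuple[Any, ...]]:
        # Inject backend-specific options (illustrative), then reuse the
        # execute-and-convert logic from the base class.
        kwargs.setdefault("execution_options", {"stream_results": True})
        yield from super().dataset_rows_select(query, **kwargs)
```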
datachain/lib/dc.py CHANGED
@@ -839,6 +839,10 @@ class DataChain(DatasetQuery):
     def mutate(self, **kwargs) -> "Self":
         """Create new signals based on existing signals.
 
+        This method cannot modify existing columns. If you need to modify an
+        existing column, use a different name for the new column and then use
+        `select()` to choose which columns to keep.
+
         This method is vectorized and more efficient compared to map(), and it does not
         extract or download any data from the internal database. However, it can only
         utilize predefined built-in functions and their combinations.
@@ -859,7 +863,26 @@ class DataChain(DatasetQuery):
             dist=cosine_distance(embedding_text, embedding_image)
         )
         ```
+
+        This method can also be used to rename signals. If a Column("name") is
+        provided as the value for the new signal, the old column is dropped;
+        otherwise a new column is created.
+
+        Example:
+        ```py
+        dc.mutate(
+            newkey=Column("oldkey")
+        )
+        ```
         """
+        existing_columns = set(self.signals_schema.values.keys())
+        for col_name in kwargs:
+            if col_name in existing_columns:
+                raise DataChainColumnError(
+                    col_name,
+                    "Cannot modify existing column with mutate(). "
+                    "Use a different name for the new column.",
+                )
         for col_name, expr in kwargs.items():
             if not isinstance(expr, Column) and isinstance(expr.type, NullType):
                 raise DataChainColumnError(
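Taken together, the new docstring and validation make mutate() strictly additive: renames go through a bare Column, while a changed value needs a fresh name plus select(). A small sketch of the resulting workflow (import paths and the from_values data are illustrative; check the release's public API):

```py
from datachain.lib.dc import Column, DataChain  # import paths assumed for 0.3.x

chain = DataChain.from_values(key=["a", "b"], value=[1, 2])

# Rename a signal: a bare Column as the value drops the old column.
renamed = chain.mutate(new_key=Column("key"))

# Reusing an existing signal name is rejected as of this release.
try:
    chain.mutate(key=Column("value"))
except Exception as exc:  # DataChainColumnError
    print(exc)  # "Cannot modify existing column with mutate(). ..."
```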
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.5
+Version: 0.3.7
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=_BRaD261RnCJgXr_DJcDf58XmbjLiuLMSsX97E8k3z8,80771
+datachain/catalog/catalog.py,sha256=dSEpktnwnpx1yY_QMvUexZVvvn6085olV7bnyImPM_k,81280
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -33,16 +33,16 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
 datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
-datachain/data_storage/schema.py,sha256=GwJIHkjhrnBxJAV1WvCMM8jiJN5h79LXDyzMmUDtRw0,8523
+datachain/data_storage/schema.py,sha256=JKpSEz8igpwZ9zkpRPYVXZxEpiXuLKEs2WNhH0KqM6U,8552
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=GEE07ZXTAtzdf53J1UDLscS0xZjukRGlmZzG6q0fZI0,28589
-datachain/data_storage/warehouse.py,sha256=tyJJDxFae6XWgLmOoG0B_MJ_Z_UEMoW_wJb96zzwTtA,33471
+datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
+datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=D8N7zCppRdc5sTYT1hNIbROc-sKA_8FN5J_m-KjD3Us,4929
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=ZvtMRMcPpBxI-rOhkXb-ry1PkGYcEFFK1w1wH12vs4g,1718
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=atGpaeCUwxDEgHIFmWqG1rAnqe7utT6S7c1jM5yVb7c,59246
+datachain/lib/dc.py,sha256=DkVhbjlxpl-HgHenIK1msofU2tUwsSiKPtNim5ai6OE,60136
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
 datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
@@ -94,9 +94,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.5.dist-info/METADATA,sha256=SaQj0C0_Ugll_S1RTRCkFM4U1fZwC7bweiaQZhovqcs,16719
-datachain-0.3.5.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
-datachain-0.3.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.5.dist-info/RECORD,,
+datachain-0.3.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.7.dist-info/METADATA,sha256=m7NZ31iEMCD2xOF8HZNp8YvGu05TmF_3UiZQQPUVmmc,16719
+datachain-0.3.7.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
+datachain-0.3.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.7.dist-info/RECORD,,