datachain 0.18.9__py3-none-any.whl → 0.18.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as a potentially problematic release.


This version of datachain might be problematic; consult the registry's advisory page for more details.

@@ -109,12 +109,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
109
109
  metadata: "MetaData",
110
110
  db: sqlite3.Connection,
111
111
  db_file: Optional[str] = None,
112
+ max_variable_number: Optional[int] = 999,
112
113
  ):
113
114
  self.engine = engine
114
115
  self.metadata = metadata
115
116
  self.db = db
116
117
  self.db_file = db_file
117
118
  self.is_closed = False
119
+ self.max_variable_number = max_variable_number
118
120
 
119
121
  @classmethod
120
122
  def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
@@ -123,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
123
125
  @staticmethod
124
126
  def _connect(
125
127
  db_file: Optional[str] = None,
126
- ) -> tuple["Engine", "MetaData", sqlite3.Connection, str]:
128
+ ) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
127
129
  try:
128
130
  if db_file == ":memory:":
129
131
  # Enable multithreaded usage of the same in-memory db
@@ -150,6 +152,13 @@ class SQLiteDatabaseEngine(DatabaseEngine):
150
152
  db.execute("PRAGMA journal_mode = WAL")
151
153
  db.execute("PRAGMA synchronous = NORMAL")
152
154
  db.execute("PRAGMA case_sensitive_like = ON")
155
+
156
+ max_variable_number = 999 # minimum in old SQLite versions
157
+ for row in db.execute("PRAGMA compile_options;").fetchall():
158
+ option = row[0]
159
+ if option.startswith("MAX_VARIABLE_NUMBER="):
160
+ max_variable_number = int(option.split("=")[1])
161
+
153
162
  if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
154
163
  import sys
155
164
 
@@ -157,7 +166,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
157
166
 
158
167
  load_usearch_extension(db)
159
168
 
160
- return engine, MetaData(), db, db_file
169
+ return engine, MetaData(), db, db_file, max_variable_number
161
170
  except RuntimeError:
162
171
  raise DataChainError("Can't connect to SQLite DB") from None
163
172
 
@@ -180,11 +189,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
180
189
  def _reconnect(self) -> None:
181
190
  if not self.is_closed:
182
191
  raise RuntimeError("Cannot reconnect on still-open DB!")
183
- engine, metadata, db, db_file = self._connect(db_file=self.db_file)
192
+ engine, metadata, db, db_file, max_variable_number = self._connect(
193
+ db_file=self.db_file
194
+ )
184
195
  self.engine = engine
185
196
  self.metadata = metadata
186
197
  self.db = db
187
198
  self.db_file = db_file
199
+ self.max_variable_number = max_variable_number
188
200
  self.is_closed = False
189
201
 
190
202
  def get_table(self, name: str) -> Table:
@@ -231,13 +243,27 @@ class SQLiteDatabaseEngine(DatabaseEngine):
231
243
  return self.db.execute(sql, parameters)
232
244
 
233
245
  def insert_dataframe(self, table_name: str, df) -> int:
246
+ # Dynamically calculates chunksize by dividing max variable limit in a
247
+ # single SQL insert with number of columns in dataframe.
248
+ # This way we avoid error: sqlite3.OperationalError: too many SQL variables,
249
+ num_columns = df.shape[1]
250
+ if num_columns == 0:
251
+ num_columns = 1
252
+
253
+ if self.max_variable_number < num_columns:
254
+ raise RuntimeError(
255
+ "Number of columns exceeds DB maximum variables when inserting data"
256
+ )
257
+
258
+ chunksize = self.max_variable_number // num_columns
259
+
234
260
  return df.to_sql(
235
261
  table_name,
236
262
  self.db,
237
263
  if_exists="append",
238
264
  index=False,
239
265
  method="multi",
240
- chunksize=1000,
266
+ chunksize=chunksize,
241
267
  )
242
268
 
243
269
  def cursor(self, factory=None):
@@ -17,6 +17,8 @@ from datachain.client import Client
17
17
  from datachain.data_storage.schema import convert_rows_custom_column_types
18
18
  from datachain.data_storage.serializer import Serializable
19
19
  from datachain.dataset import DatasetRecord, StorageURI
20
+ from datachain.lib.file import File
21
+ from datachain.lib.signal_schema import SignalSchema
20
22
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
21
23
  from datachain.query.batch import RowsOutput
22
24
  from datachain.query.utils import get_query_id_column
@@ -35,7 +37,6 @@ if TYPE_CHECKING:
35
37
  from datachain.data_storage import schema
36
38
  from datachain.data_storage.db_engine import DatabaseEngine
37
39
  from datachain.data_storage.schema import DataTable
38
- from datachain.lib.file import File
39
40
 
40
41
 
41
42
  logger = logging.getLogger("datachain")
@@ -370,14 +371,18 @@ class AbstractWarehouse(ABC, Serializable):
370
371
  if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
371
372
  return None, None
372
373
 
374
+ file_signals = list(
375
+ SignalSchema.deserialize(dataset.feature_schema).get_signals(File)
376
+ )
377
+
373
378
  dr = self.dataset_rows(dataset, version)
374
379
  table = dr.get_table()
375
380
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
376
381
  sa.func.count(table.c.sys__id),
377
382
  )
378
- size_columns = [
379
- c for c in table.columns if c.name == "size" or c.name.endswith("__size")
380
- ]
383
+ size_column_names = [s.replace(".", "__") + "__size" for s in file_signals]
384
+ size_columns = [c for c in table.columns if c.name in size_column_names]
385
+
381
386
  if size_columns:
382
387
  expressions = (*expressions, sa.func.sum(sum(size_columns)))
383
388
  query = sa.select(*expressions)
datachain/lib/dc/json.py CHANGED
@@ -1,18 +1,13 @@
1
1
  import os
2
2
  import os.path
3
3
  import re
4
- from typing import (
5
- TYPE_CHECKING,
6
- Optional,
7
- Union,
8
- )
4
+ from typing import TYPE_CHECKING, Optional, Union
9
5
 
6
+ import cloudpickle
7
+
8
+ from datachain.lib import meta_formats
10
9
  from datachain.lib.data_model import DataType
11
- from datachain.lib.file import (
12
- File,
13
- FileType,
14
- )
15
- from datachain.lib.meta_formats import read_meta
10
+ from datachain.lib.file import File, FileType
16
11
 
17
12
  if TYPE_CHECKING:
18
13
  from typing_extensions import ParamSpec
@@ -76,7 +71,7 @@ def read_json(
76
71
  column = format
77
72
  chain = read_storage(uri=path, type=type, **kwargs)
78
73
  signal_dict = {
79
- column: read_meta(
74
+ column: meta_formats.read_meta(
80
75
  schema_from=schema_from,
81
76
  format=format,
82
77
  spec=spec,
@@ -88,4 +83,7 @@ def read_json(
88
83
  }
89
84
  # disable prefetch if nrows is set
90
85
  settings = {"prefetch": 0} if nrows else {}
86
+
87
+ cloudpickle.register_pickle_by_value(meta_formats)
88
+
91
89
  return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.9
3
+ Version: 0.18.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -50,8 +50,8 @@ datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw
50
50
  datachain/data_storage/metastore.py,sha256=1PaRTQbL7kjcU1BVjiLjXJLrrLzQtUvpqLmm0pwc1rU,39882
51
51
  datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
52
52
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
53
- datachain/data_storage/sqlite.py,sha256=bwZAB_NUMT2WMv5tPQnnLFA0P-PiQtxzSaQ1q6xDxOU,24590
54
- datachain/data_storage/warehouse.py,sha256=RkdX1cunfmpDkRYRdOGNy0kLw7RekIokVl3Dd0i-hrA,31534
53
+ datachain/data_storage/sqlite.py,sha256=BB8x7jtBmHK9lwn2zTo4HgfTKWGF43JxOsGr38J8YV8,25698
54
+ datachain/data_storage/warehouse.py,sha256=imPm4R2V7TkqgGNSO2FGnKu03axU9UVLMfdUPfpwgHE,31747
55
55
  datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
56
56
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
57
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -102,7 +102,7 @@ datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,
102
102
  datachain/lib/dc/datachain.py,sha256=5rR_QqG4vesq-x545ZTSFJDSb6Oc5CW4-ziQYD6DpW4,80993
103
103
  datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
104
104
  datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
105
- datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
105
+ datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
106
106
  datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
107
107
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
108
108
  datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
153
153
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
154
154
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
155
155
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
156
- datachain-0.18.9.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
- datachain-0.18.9.dist-info/METADATA,sha256=0BhJEeQiYf41Rg7DLgJ-WtiUu9cpwwUtVwo__lPaMAw,11319
158
- datachain-0.18.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
- datachain-0.18.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
- datachain-0.18.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
- datachain-0.18.9.dist-info/RECORD,,
156
+ datachain-0.18.11.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
+ datachain-0.18.11.dist-info/METADATA,sha256=TgOokr9DxfY4A1mq7-5APy8DTHUqFEf2FslYxASH1IA,11320
158
+ datachain-0.18.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
+ datachain-0.18.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
+ datachain-0.18.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
+ datachain-0.18.11.dist-info/RECORD,,