datachain 0.18.9__py3-none-any.whl → 0.18.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/data_storage/sqlite.py +30 -4
- datachain/data_storage/warehouse.py +9 -4
- datachain/lib/dc/json.py +9 -11
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/METADATA +1 -1
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/RECORD +9 -9
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/WHEEL +0 -0
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
|
@@ -109,12 +109,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
109
109
|
metadata: "MetaData",
|
|
110
110
|
db: sqlite3.Connection,
|
|
111
111
|
db_file: Optional[str] = None,
|
|
112
|
+
max_variable_number: Optional[int] = 999,
|
|
112
113
|
):
|
|
113
114
|
self.engine = engine
|
|
114
115
|
self.metadata = metadata
|
|
115
116
|
self.db = db
|
|
116
117
|
self.db_file = db_file
|
|
117
118
|
self.is_closed = False
|
|
119
|
+
self.max_variable_number = max_variable_number
|
|
118
120
|
|
|
119
121
|
@classmethod
|
|
120
122
|
def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
|
|
@@ -123,7 +125,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
123
125
|
@staticmethod
|
|
124
126
|
def _connect(
|
|
125
127
|
db_file: Optional[str] = None,
|
|
126
|
-
) -> tuple["Engine", "MetaData", sqlite3.Connection, str]:
|
|
128
|
+
) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
|
|
127
129
|
try:
|
|
128
130
|
if db_file == ":memory:":
|
|
129
131
|
# Enable multithreaded usage of the same in-memory db
|
|
@@ -150,6 +152,13 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
150
152
|
db.execute("PRAGMA journal_mode = WAL")
|
|
151
153
|
db.execute("PRAGMA synchronous = NORMAL")
|
|
152
154
|
db.execute("PRAGMA case_sensitive_like = ON")
|
|
155
|
+
|
|
156
|
+
max_variable_number = 999 # minimum in old SQLite versions
|
|
157
|
+
for row in db.execute("PRAGMA compile_options;").fetchall():
|
|
158
|
+
option = row[0]
|
|
159
|
+
if option.startswith("MAX_VARIABLE_NUMBER="):
|
|
160
|
+
max_variable_number = int(option.split("=")[1])
|
|
161
|
+
|
|
153
162
|
if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
|
|
154
163
|
import sys
|
|
155
164
|
|
|
@@ -157,7 +166,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
157
166
|
|
|
158
167
|
load_usearch_extension(db)
|
|
159
168
|
|
|
160
|
-
return engine, MetaData(), db, db_file
|
|
169
|
+
return engine, MetaData(), db, db_file, max_variable_number
|
|
161
170
|
except RuntimeError:
|
|
162
171
|
raise DataChainError("Can't connect to SQLite DB") from None
|
|
163
172
|
|
|
@@ -180,11 +189,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
180
189
|
def _reconnect(self) -> None:
|
|
181
190
|
if not self.is_closed:
|
|
182
191
|
raise RuntimeError("Cannot reconnect on still-open DB!")
|
|
183
|
-
engine, metadata, db, db_file = self._connect(
|
|
192
|
+
engine, metadata, db, db_file, max_variable_number = self._connect(
|
|
193
|
+
db_file=self.db_file
|
|
194
|
+
)
|
|
184
195
|
self.engine = engine
|
|
185
196
|
self.metadata = metadata
|
|
186
197
|
self.db = db
|
|
187
198
|
self.db_file = db_file
|
|
199
|
+
self.max_variable_number = max_variable_number
|
|
188
200
|
self.is_closed = False
|
|
189
201
|
|
|
190
202
|
def get_table(self, name: str) -> Table:
|
|
@@ -231,13 +243,27 @@ class SQLiteDatabaseEngine(DatabaseEngine):
|
|
|
231
243
|
return self.db.execute(sql, parameters)
|
|
232
244
|
|
|
233
245
|
def insert_dataframe(self, table_name: str, df) -> int:
|
|
246
|
+
# Dynamically calculates chunksize by dividing max variable limit in a
|
|
247
|
+
# single SQL insert with number of columns in dataframe.
|
|
248
|
+
# This way we avoid error: sqlite3.OperationalError: too many SQL variables,
|
|
249
|
+
num_columns = df.shape[1]
|
|
250
|
+
if num_columns == 0:
|
|
251
|
+
num_columns = 1
|
|
252
|
+
|
|
253
|
+
if self.max_variable_number < num_columns:
|
|
254
|
+
raise RuntimeError(
|
|
255
|
+
"Number of columns exceeds DB maximum variables when inserting data"
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
chunksize = self.max_variable_number // num_columns
|
|
259
|
+
|
|
234
260
|
return df.to_sql(
|
|
235
261
|
table_name,
|
|
236
262
|
self.db,
|
|
237
263
|
if_exists="append",
|
|
238
264
|
index=False,
|
|
239
265
|
method="multi",
|
|
240
|
-
chunksize=
|
|
266
|
+
chunksize=chunksize,
|
|
241
267
|
)
|
|
242
268
|
|
|
243
269
|
def cursor(self, factory=None):
|
datachain/data_storage/warehouse.py
CHANGED
|
@@ -17,6 +17,8 @@ from datachain.client import Client
|
|
|
17
17
|
from datachain.data_storage.schema import convert_rows_custom_column_types
|
|
18
18
|
from datachain.data_storage.serializer import Serializable
|
|
19
19
|
from datachain.dataset import DatasetRecord, StorageURI
|
|
20
|
+
from datachain.lib.file import File
|
|
21
|
+
from datachain.lib.signal_schema import SignalSchema
|
|
20
22
|
from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
|
|
21
23
|
from datachain.query.batch import RowsOutput
|
|
22
24
|
from datachain.query.utils import get_query_id_column
|
|
@@ -35,7 +37,6 @@ if TYPE_CHECKING:
|
|
|
35
37
|
from datachain.data_storage import schema
|
|
36
38
|
from datachain.data_storage.db_engine import DatabaseEngine
|
|
37
39
|
from datachain.data_storage.schema import DataTable
|
|
38
|
-
from datachain.lib.file import File
|
|
39
40
|
|
|
40
41
|
|
|
41
42
|
logger = logging.getLogger("datachain")
|
|
@@ -370,14 +371,18 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
370
371
|
if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
|
|
371
372
|
return None, None
|
|
372
373
|
|
|
374
|
+
file_signals = list(
|
|
375
|
+
SignalSchema.deserialize(dataset.feature_schema).get_signals(File)
|
|
376
|
+
)
|
|
377
|
+
|
|
373
378
|
dr = self.dataset_rows(dataset, version)
|
|
374
379
|
table = dr.get_table()
|
|
375
380
|
expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
|
|
376
381
|
sa.func.count(table.c.sys__id),
|
|
377
382
|
)
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
383
|
+
size_column_names = [s.replace(".", "__") + "__size" for s in file_signals]
|
|
384
|
+
size_columns = [c for c in table.columns if c.name in size_column_names]
|
|
385
|
+
|
|
381
386
|
if size_columns:
|
|
382
387
|
expressions = (*expressions, sa.func.sum(sum(size_columns)))
|
|
383
388
|
query = sa.select(*expressions)
|
datachain/lib/dc/json.py
CHANGED
|
@@ -1,18 +1,13 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import os.path
|
|
3
3
|
import re
|
|
4
|
-
from typing import (
|
|
5
|
-
TYPE_CHECKING,
|
|
6
|
-
Optional,
|
|
7
|
-
Union,
|
|
8
|
-
)
|
|
4
|
+
from typing import TYPE_CHECKING, Optional, Union
|
|
9
5
|
|
|
6
|
+
import cloudpickle
|
|
7
|
+
|
|
8
|
+
from datachain.lib import meta_formats
|
|
10
9
|
from datachain.lib.data_model import DataType
|
|
11
|
-
from datachain.lib.file import (
|
|
12
|
-
File,
|
|
13
|
-
FileType,
|
|
14
|
-
)
|
|
15
|
-
from datachain.lib.meta_formats import read_meta
|
|
10
|
+
from datachain.lib.file import File, FileType
|
|
16
11
|
|
|
17
12
|
if TYPE_CHECKING:
|
|
18
13
|
from typing_extensions import ParamSpec
|
|
@@ -76,7 +71,7 @@ def read_json(
|
|
|
76
71
|
column = format
|
|
77
72
|
chain = read_storage(uri=path, type=type, **kwargs)
|
|
78
73
|
signal_dict = {
|
|
79
|
-
column: read_meta(
|
|
74
|
+
column: meta_formats.read_meta(
|
|
80
75
|
schema_from=schema_from,
|
|
81
76
|
format=format,
|
|
82
77
|
spec=spec,
|
|
@@ -88,4 +83,7 @@ def read_json(
|
|
|
88
83
|
}
|
|
89
84
|
# disable prefetch if nrows is set
|
|
90
85
|
settings = {"prefetch": 0} if nrows else {}
|
|
86
|
+
|
|
87
|
+
cloudpickle.register_pickle_by_value(meta_formats)
|
|
88
|
+
|
|
91
89
|
return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
|
{datachain-0.18.9.dist-info → datachain-0.18.11.dist-info}/RECORD
CHANGED
|
@@ -50,8 +50,8 @@ datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw
|
|
|
50
50
|
datachain/data_storage/metastore.py,sha256=1PaRTQbL7kjcU1BVjiLjXJLrrLzQtUvpqLmm0pwc1rU,39882
|
|
51
51
|
datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
|
|
52
52
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
53
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
54
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
53
|
+
datachain/data_storage/sqlite.py,sha256=BB8x7jtBmHK9lwn2zTo4HgfTKWGF43JxOsGr38J8YV8,25698
|
|
54
|
+
datachain/data_storage/warehouse.py,sha256=imPm4R2V7TkqgGNSO2FGnKu03axU9UVLMfdUPfpwgHE,31747
|
|
55
55
|
datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
|
|
56
56
|
datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
57
57
|
datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
|
|
@@ -102,7 +102,7 @@ datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,
|
|
|
102
102
|
datachain/lib/dc/datachain.py,sha256=5rR_QqG4vesq-x545ZTSFJDSb6Oc5CW4-ziQYD6DpW4,80993
|
|
103
103
|
datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
|
|
104
104
|
datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
|
|
105
|
-
datachain/lib/dc/json.py,sha256=
|
|
105
|
+
datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
|
|
106
106
|
datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
|
|
107
107
|
datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
|
|
108
108
|
datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
|
|
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
153
153
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
154
154
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
155
155
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
156
|
-
datachain-0.18.
|
|
157
|
-
datachain-0.18.
|
|
158
|
-
datachain-0.18.
|
|
159
|
-
datachain-0.18.
|
|
160
|
-
datachain-0.18.
|
|
161
|
-
datachain-0.18.
|
|
156
|
+
datachain-0.18.11.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
157
|
+
datachain-0.18.11.dist-info/METADATA,sha256=TgOokr9DxfY4A1mq7-5APy8DTHUqFEf2FslYxASH1IA,11320
|
|
158
|
+
datachain-0.18.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
159
|
+
datachain-0.18.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
160
|
+
datachain-0.18.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
161
|
+
datachain-0.18.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|