datachain 0.30.4__py3-none-any.whl → 0.30.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.
datachain/cli/commands/datasets.py CHANGED
@@ -1,30 +1,41 @@
 import sys
-from typing import TYPE_CHECKING, Optional
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Optional, Union

 from tabulate import tabulate

-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
+from datachain import semver
 from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio

+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+def group_dataset_versions(
+    datasets: Iterable[tuple[str, str]], latest_only=True
+) -> dict[str, Union[str, list[str]]]:
+    grouped: dict[str, list[tuple[int, int, int]]] = {}

-def group_dataset_versions(datasets, latest_only=True):
-    grouped = {}
     # Sort to ensure groupby works as expected
     # (groupby expects consecutive items with the same key)
     for name, version in sorted(datasets):
-        grouped.setdefault(name, []).append(version)
+        grouped.setdefault(name, []).append(semver.parse(version))

     if latest_only:
         # For each dataset name, pick the highest version.
-        return {name: max(versions) for name, versions in grouped.items()}
+        return {
+            name: semver.create(*(max(versions))) for name, versions in grouped.items()
+        }
+
     # For each dataset name, return a sorted list of unique versions.
-    return {name: sorted(set(versions)) for name, versions in grouped.items()}
+    return {
+        name: [semver.create(*v) for v in sorted(set(versions))]
+        for name, versions in grouped.items()
+    }


 def list_datasets(
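Worth noting: the rewrite orders versions numerically via `semver.parse` instead of comparing raw strings, where lexicographic ordering misranks multi-digit components. A minimal self-contained sketch of why that matters; the `parse`/`create` helpers below are stand-ins mimicking what `datachain.semver` appears to do per this diff (parse returns a `(major, minor, patch)` tuple, create formats one back):

```python
def parse(version: str) -> tuple[int, int, int]:
    # Stand-in for datachain.semver.parse (assumption based on this diff).
    major, minor, patch = (int(p) for p in version.split("."))
    return major, minor, patch

def create(major: int, minor: int, patch: int) -> str:
    # Stand-in for datachain.semver.create (assumption based on this diff).
    return f"{major}.{minor}.{patch}"

versions = ["1.2.0", "1.10.0", "1.9.3"]
assert max(versions) == "1.9.3"                               # old: lexicographic, wrong
assert create(*max(parse(v) for v in versions)) == "1.10.0"   # new: numeric, right
```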
@@ -35,7 +46,7 @@ def list_datasets(
     team: Optional[str] = None,
     latest_only: bool = True,
     name: Optional[str] = None,
-):
+) -> None:
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
     if name:
@@ -95,27 +106,31 @@ def list_datasets(
     print(tabulate(rows, headers="keys"))


-def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
+def list_datasets_local(
+    catalog: "Catalog", name: Optional[str] = None
+) -> Iterator[tuple[str, str]]:
     if name:
         yield from list_datasets_local_versions(catalog, name)
         return

     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield (d.full_name, v.version)
+            yield d.full_name, v.version


-def list_datasets_local_versions(catalog: "Catalog", name: str):
+def list_datasets_local_versions(
+    catalog: "Catalog", name: str
+) -> Iterator[tuple[str, str]]:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     ds = catalog.get_dataset(
         name, namespace_name=namespace_name, project_name=project_name
     )
     for v in ds.versions:
-        yield (name, v.version)
+        yield name, v.version


-def _datasets_tabulate_row(name, both, local_version, studio_version):
+def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
     row = {
         "Name": name,
     }
@@ -136,7 +151,7 @@ def rm_dataset(
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     team: Optional[str] = None,
-):
+) -> None:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if studio:
@@ -166,7 +181,7 @@ def edit_dataset(
     description: Optional[str] = None,
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
-):
+) -> None:
     from datachain.lib.dc.utils import is_studio

     namespace_name, project_name, name = catalog.get_full_dataset_name(name)
datachain/data_storage/warehouse.py CHANGED
@@ -1,5 +1,4 @@
 import glob
-import json
 import logging
 import posixpath
 import random
@@ -11,6 +10,7 @@ from urllib.parse import urlparse

 import attrs
 import sqlalchemy as sa
+import ujson as json
 from sqlalchemy.sql.expression import true

 from datachain.client import Client
@@ -122,7 +122,7 @@ class AbstractWarehouse(ABC, Serializable):
         if value_type is str:
             return val
         if value_type in (dict, list):
-            return json.dumps(val)
+            return json.dumps(val, ensure_ascii=False)
         raise ValueError(
             f"Cannot convert value {val!r} with type {value_type} to JSON"
         )
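Throughout this release the stdlib `json` and `orjson` usages are swapped for `ujson` (aliased as `json`), with `ensure_ascii=False` so non-ASCII text is stored as raw UTF-8 instead of `\uXXXX` escapes. A quick illustration of the difference:

```python
import json    # stdlib: escapes non-ASCII by default
import ujson   # the drop-in replacement used in this release

val = {"city": "Zürich"}
print(json.dumps(val))                       # {"city": "Z\u00fcrich"} (escaped)
print(ujson.dumps(val, ensure_ascii=False))  # {"city": "Zürich"} (raw UTF-8)
```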
datachain/delta.py CHANGED
@@ -4,7 +4,7 @@ from functools import wraps
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union

 import datachain
-from datachain.dataset import DatasetDependency
+from datachain.dataset import DatasetDependency, DatasetRecord
 from datachain.error import DatasetNotFoundError
 from datachain.project import Project

@@ -30,9 +30,10 @@ def delta_disabled(

     @wraps(method)
     def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
-        if self.delta:
+        if self.delta and not self._delta_unsafe:
             raise NotImplementedError(
-                f"Delta update cannot be used with {method.__name__}"
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
             )
         return method(self, *args, **kwargs)

@@ -124,10 +125,19 @@ def _get_retry_chain(
     # Subtract also diff chain since some items might be picked
     # up by `delta=True` itself (e.g. records got modified AND are missing in the
     # result dataset atm)
-    return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )


 def _get_source_info(
+    source_ds: DatasetRecord,
     name: str,
     namespace_name: str,
     project_name: str,
@@ -154,25 +164,23 @@ def _get_source_info(
         indirect=False,
     )

-    dep = dependencies[0]
-    if not dep:
+    source_ds_dep = next((d for d in dependencies if d.name == source_ds.name), None)
+    if not source_ds_dep:
         # Starting dataset was removed, back off to normal dataset creation
         return None, None, None, None, None

-    source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
-    source_ds_name = dep.name
-    source_ds_version = dep.version
-    source_ds_latest_version = catalog.get_dataset(
-        source_ds_name,
-        namespace_name=source_ds_project.namespace.name,
-        project_name=source_ds_project.name,
-    ).latest_version
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )

     return (
-        source_ds_name,
-        source_ds_project,
-        source_ds_version,
-        source_ds_latest_version,
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
         dependencies,
     )

@@ -244,7 +252,14 @@ def delta_retry_update(
         source_ds_version,
         source_ds_latest_version,
         dependencies,
-    ) = _get_source_info(name, namespace_name, project_name, latest_version, catalog)
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )

     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
@@ -267,8 +282,9 @@ def delta_retry_update(
     if dependencies:
         dependencies = copy(dependencies)
         dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
         # Update to latest version
-        dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]

     # Handle retry functionality if enabled
     if delta_retry:
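The retry chain is now filtered with `DataChain.diff` (keep added and unchanged rows, drop modified and deleted ones) plus `distinct` on the key columns, rather than a plain `subtract`. A small sketch of what those `diff` flags do on toy data, assuming in-memory chains via `read_values` behave as documented:

```python
import datachain as dc

old = dc.read_values(id=[1, 2, 3], val=["a", "b", "c"])
new = dc.read_values(id=[1, 2, 4], val=["a", "B", "d"])

# added=True keeps rows present only in `new` (id=4); same=True keeps rows
# unchanged across both (id=1); modified=False drops rows whose other
# columns changed (id=2); deleted rows exist only in `old`, so they cannot
# appear in the kept output anyway.
kept = new.diff(old, on="id", added=True, same=True, modified=False, deleted=False)
kept.show()  # expect rows with id=1 (same) and id=4 (added)
```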
datachain/lib/arrow.py CHANGED
@@ -2,8 +2,8 @@ from collections.abc import Sequence
2
2
  from itertools import islice
3
3
  from typing import TYPE_CHECKING, Any, Optional
4
4
 
5
- import orjson
6
5
  import pyarrow as pa
6
+ import ujson as json
7
7
  from pyarrow._csv import ParseOptions
8
8
  from pyarrow.dataset import CsvFileFormat, dataset
9
9
  from tqdm.auto import tqdm
@@ -269,7 +269,7 @@ def _get_hf_schema(
269
269
  def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
270
270
  """Return a restored SignalSchema from parquet metadata, if any is found."""
271
271
  if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
272
- serialized_signal_schema = orjson.loads(
272
+ serialized_signal_schema = json.loads(
273
273
  schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
274
274
  )
275
275
  return SignalSchema.deserialize(serialized_signal_schema)
datachain/lib/dc/datachain.py CHANGED
@@ -19,8 +19,8 @@ from typing import (
     overload,
 )

-import orjson
 import sqlalchemy
+import ujson as json
 from pydantic import BaseModel
 from sqlalchemy.sql.elements import ColumnElement
 from tqdm import tqdm
@@ -193,6 +193,7 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_unsafe = False
         self._delta_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
         self._delta_compare: Optional[Union[str, Sequence[str]]] = None
@@ -216,6 +217,7 @@ class DataChain:
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
         delta_retry: Optional[Union[bool, str]] = None,
+        delta_unsafe: bool = False,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -226,6 +228,7 @@ class DataChain:
         self._delta_result_on = right_on
         self._delta_compare = compare
         self._delta_retry = delta_retry
+        self._delta_unsafe = delta_unsafe
         return self

     @property
@@ -238,6 +241,10 @@ class DataChain:
         """Returns True if this chain is ran in "delta" update mode"""
         return self._delta

+    @property
+    def delta_unsafe(self) -> bool:
+        return self._delta_unsafe
+
     @property
     def schema(self) -> dict[str, DataType]:
         """Get schema of the chain."""
@@ -328,6 +335,7 @@ class DataChain:
             right_on=self._delta_result_on,
             compare=self._delta_compare,
             delta_retry=self._delta_retry,
+            delta_unsafe=self._delta_unsafe,
         )

         return chain
@@ -462,8 +470,6 @@ class DataChain:
         Returns:
             DataChain: A new DataChain instance with the new set of columns.
         """
-        import json
-
         import pyarrow as pa

         from datachain.lib.arrow import schema_to_output
@@ -2129,9 +2135,9 @@ class DataChain:
         fsspec_fs = client.create_fs(**fs_kwargs)

         _partition_cols = list(partition_cols) if partition_cols else None
-        signal_schema_metadata = orjson.dumps(
-            self._effective_signals_schema.serialize()
-        )
+        signal_schema_metadata = json.dumps(
+            self._effective_signals_schema.serialize(), ensure_ascii=False
+        ).encode("utf-8")

         column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)

@@ -2278,7 +2284,11 @@ class DataChain:
                         f.write(b"\n")
                     else:
                         is_first = False
-                    f.write(orjson.dumps(row_to_nested_dict(headers, row)))
+                    f.write(
+                        json.dumps(
+                            row_to_nested_dict(headers, row), ensure_ascii=False
+                        ).encode("utf-8")
+                    )
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"\n]\n")
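One consequence of moving off `orjson` is visible in the `.encode("utf-8")` calls added above: `orjson.dumps` returns `bytes`, while `ujson.dumps` returns `str`, so results that feed binary writes (parquet metadata, JSON export) now need an explicit encode. For example:

```python
import ujson

# ujson.dumps returns str; encode before writing to a binary file handle.
# orjson.dumps returned bytes directly, so no encode step was needed before.
payload = ujson.dumps({"a": 1}, ensure_ascii=False).encode("utf-8")
assert isinstance(payload, bytes)
```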
datachain/lib/dc/datasets.py CHANGED
@@ -40,6 +40,7 @@ def read_dataset(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
@@ -80,6 +81,8 @@ def read_dataset(
         update: If True always checks for newer versions available on Studio, even if
             some version of the dataset exists locally already. If False (default), it
             will only fetch the dataset from Studio if it is not found locally.
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct.


     Example:
@@ -205,6 +208,7 @@ def read_dataset(
         right_on=delta_result_on,
         compare=delta_compare,
         delta_retry=delta_retry,
+        delta_unsafe=delta_unsafe,
     )

     return chain
datachain/lib/dc/storage.py CHANGED
@@ -43,6 +43,7 @@ def read_storage(
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
     delta_retry: Optional[Union[bool, str]] = None,
+    delta_unsafe: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -77,6 +78,9 @@ def read_storage(
             (error mode)
         - True: Reprocess records missing from the result dataset (missing mode)
         - None: No retry processing (default)
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct. Caller must ensure datasets are consistent and not partially
+            updated.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -218,6 +222,7 @@ def read_storage(
         right_on=delta_result_on,
         compare=delta_compare,
         delta_retry=delta_retry,
+        delta_unsafe=delta_unsafe,
     )

     return storage_chain
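How the new flag is meant to be used, as a hedged sketch (the bucket URI and delta keys below are illustrative, not from this package): passing `delta_unsafe=True` lets a delta chain run the otherwise-blocked operations, with dataset consistency left to the caller.

```python
import datachain as dc
from datachain import func

chain = dc.read_storage(
    "s3://my-bucket/images/",  # illustrative URI
    delta=True,
    delta_on="file.path",
    delta_unsafe=True,         # opt in to merge/agg/union/group_by/distinct
)
# Without delta_unsafe=True, delta_disabled would raise NotImplementedError here:
grouped = chain.group_by(cnt=func.count(), partition_by="file.path")
```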
datachain/lib/model_store.py CHANGED
@@ -89,3 +89,15 @@ class ModelStore:
         and ModelStore.is_pydantic(parent_type)
         and "@" in ModelStore.get_name(parent_type)
     )
+
+    @classmethod
+    def rebuild_all(cls) -> None:
+        """Ensure pydantic schemas are (re)built for all registered models.
+
+        Uses ``force=True`` to avoid subtle cases where a deserialized class
+        (e.g. from by-value cloudpickle in workers) reports built state but
+        nested model field schemas aren't fully resolved yet.
+        """
+        for versions in cls.store.values():
+            for model in versions.values():
+                model.model_rebuild(force=True)
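The new `rebuild_all` leans on pydantic v2's `model_rebuild(force=True)`. A standalone illustration of what a forced rebuild does for models whose class objects crossed process boundaries (here simulated with a plain forward reference):

```python
from pydantic import BaseModel

class Inner(BaseModel):
    x: int

class Outer(BaseModel):
    inner: "Inner"  # forward ref, resolved when the schema is built

# force=True rebuilds even if pydantic believes the schema is already built,
# which is what ModelStore.rebuild_all relies on after cloudpickle loads.
Outer.model_rebuild(force=True)
print(Outer(inner={"x": 1}).inner.x)  # 1
```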
datachain/query/dispatch.py CHANGED
@@ -13,6 +13,7 @@ from multiprocess import get_context
 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
 from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import _get_cache
 from datachain.query.dataset import (
     get_download_callback,
@@ -130,6 +131,8 @@ class UDFDispatcher:

     def _create_worker(self) -> "UDFWorker":
         udf: UDFAdapter = loads(self.udf_data)
+        # Ensure all registered DataModels have rebuilt schemas in worker processes.
+        ModelStore.rebuild_all()
         return UDFWorker(
             self.catalog,
             udf,
@@ -196,6 +199,8 @@ class UDFDispatcher:
         generated_cb: Callback = DEFAULT_CALLBACK,
     ) -> None:
         udf: UDFAdapter = loads(self.udf_data)
+        # Rebuild schemas in single process too for consistency (cheap, idempotent).
+        ModelStore.rebuild_all()

         if ids_only and not self.is_batching:
             input_rows = flatten(input_rows)
datachain/sql/sqlite/base.py CHANGED
@@ -8,8 +8,8 @@ from functools import cache
 from types import MappingProxyType
 from typing import Callable, Optional

-import orjson
 import sqlalchemy as sa
+import ujson as json
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.elements import literal
@@ -182,7 +182,7 @@ def missing_vector_function(name, exc):


 def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
-    return orjson.dumps(string.split(sep, maxsplit)).decode("utf-8")
+    return json.dumps(string.split(sep, maxsplit), ensure_ascii=False)


 def sqlite_int_hash_64(x: int) -> int:
@@ -453,17 +453,17 @@ def compile_byte_hamming_distance(element, compiler, **kwargs):


 def py_json_array_length(arr):
-    return len(orjson.loads(arr))
+    return len(json.loads(arr))


 def py_json_array_contains(arr, value, is_json):
     if is_json:
-        value = orjson.loads(value)
-    return value in orjson.loads(arr)
+        value = json.loads(value)
+    return value in json.loads(arr)


 def py_json_array_get_element(val, idx):
-    arr = orjson.loads(val)
+    arr = json.loads(val)
     try:
         return arr[idx]
     except IndexError:
@@ -471,17 +471,18 @@ def py_json_array_get_element(val, idx):


 def py_json_array_slice(val, offset: int, length: Optional[int] = None):
-    arr = orjson.loads(val)
+    arr = json.loads(val)
     try:
-        return orjson.dumps(
-            list(arr[offset : offset + length] if length is not None else arr[offset:])
-        ).decode("utf-8")
+        return json.dumps(
+            list(arr[offset : offset + length] if length is not None else arr[offset:]),
+            ensure_ascii=False,
+        )
     except IndexError:
         return None


 def py_json_array_join(val, sep: str):
-    return sep.join(orjson.loads(val))
+    return sep.join(json.loads(val))


 def compile_array_get_element(element, compiler, **kwargs):
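These `py_json_array_*` helpers are plain Python functions that the SQLite dialect registers as SQL functions. A self-contained sketch of that registration pattern (the SQL function name here is illustrative, not datachain's):

```python
import sqlite3

import ujson as json

def py_json_array_length(arr):
    # Same shape as the helper above: parse the JSON text, return its length.
    return len(json.loads(arr))

conn = sqlite3.connect(":memory:")
# Register the Python helper so SQL queries can call it by name.
conn.create_function("json_array_length_py", 1, py_json_array_length)
row = conn.execute("SELECT json_array_length_py(?)", ('["a","b","c"]',)).fetchone()
print(row[0])  # 3
```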
datachain/sql/sqlite/types.py CHANGED
@@ -1,6 +1,6 @@
 import sqlite3

-import orjson
+import ujson as json
 from sqlalchemy import types

 from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -28,26 +28,21 @@ class Array(types.UserDefinedType):


 def adapt_array(arr):
-    return orjson.dumps(arr).decode("utf-8")
+    return json.dumps(arr, ensure_ascii=False)


 def adapt_dict(dct):
-    return orjson.dumps(dct).decode("utf-8")
+    return json.dumps(dct, ensure_ascii=False)


 def convert_array(arr):
-    return orjson.loads(arr)
+    return json.loads(arr)


 def adapt_np_array(arr):
-    def _json_serialize(obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return obj
-
-    return orjson.dumps(
-        arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
-    ).decode("utf-8")
+    # Primarily needed for UDF numpy results (e.g. WDS)
+    # tolist() gives nested Python lists + native scalars; ujson.dumps handles NaN/Inf.
+    return json.dumps(arr.tolist(), ensure_ascii=False)


 def adapt_np_generic(val):
@@ -74,5 +69,5 @@ class SQLiteTypeConverter(TypeConverter):
 class SQLiteTypeReadConverter(TypeReadConverter):
     def array(self, value, item_type, dialect):
         if isinstance(value, str):
-            value = orjson.loads(value)
+            value = json.loads(value)
         return super().array(value, item_type, dialect)
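For `adapt_np_array`, the `orjson` numpy options are replaced by a plain `arr.tolist()` before `ujson.dumps`; per the diff's own comment, ujson tolerates NaN/Inf values that can show up in UDF results, where orjson needed special options. Roughly:

```python
import numpy as np
import ujson as json

arr = np.array([[1.0, float("nan")], [3.0, float("inf")]])
# tolist() yields nested lists of native Python floats; ujson serializes
# NaN/Inf (per the comment in this diff), so no custom default= is needed.
print(json.dumps(arr.tolist(), ensure_ascii=False))
```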
datachain/sql/types.py CHANGED
@@ -16,8 +16,8 @@ from datetime import datetime
 from types import MappingProxyType
 from typing import Any, Union

-import orjson
 import sqlalchemy as sa
+import ujson as jsonlib
 from sqlalchemy import TypeDecorator, types

 from datachain.lib.data_model import StandardType
@@ -352,7 +352,7 @@ class Array(SQLType):
     def on_read_convert(self, value, dialect):
         r = read_converter(dialect).array(value, self.item_type, dialect)
         if isinstance(self.item_type, JSON):
-            r = [orjson.loads(item) if isinstance(item, str) else item for item in r]
+            r = [jsonlib.loads(item) if isinstance(item, str) else item for item in r]
         return r


@@ -466,7 +466,7 @@ class TypeReadConverter:
         if isinstance(value, str):
             if value == "":
                 return {}
-            return orjson.loads(value)
+            return jsonlib.loads(value)
         return value

     def datetime(self, value):
datachain/utils.py CHANGED
@@ -417,7 +417,7 @@ class JSONSerialize(json.JSONEncoder):


 def inside_colab() -> bool:
     try:
-        from google import colab  # noqa: F401
+        from google import colab  # type: ignore[attr-defined] # noqa: F401
     except ImportError:
         return False
     return True
datachain-0.30.4.dist-info/METADATA → datachain-0.30.6.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.30.4
+Version: 0.30.6
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -22,6 +22,7 @@ Requires-Dist: tomlkit
 Requires-Dist: tqdm
 Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: ujson>=5.10.0
 Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -38,7 +39,6 @@ Requires-Dist: shtab<2,>=1.3.4
 Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
-Requires-Dist: orjson>=3.10.5
 Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
@@ -92,7 +92,7 @@ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
 Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
-Requires-Dist: pytest-env>=1.1.0; extra == "tests"
+Requires-Dist: pytest-dotenv; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
datachain-0.30.4.dist-info/RECORD → datachain-0.30.6.dist-info/RECORD RENAMED
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=ATGa-CBTFoZeTN2V40-zHEzfMBcdYK0WuoJ6H2yEAvo,25268
-datachain/delta.py,sha256=dghGvD44LcglvL5-kUOIKk75ywBO0U7eikA3twKZC28,10202
+datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
 datachain/error.py,sha256=OWwWMkzZYJrkcoEDGhJHMf7SfKvxcsOLRF94mjPf29I,1609
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
@@ -19,7 +19,7 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=27750qCSNxIChEzhV02damIFreLMfr7UdiWqMFyk8AA,15361
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=Md1iu-ehIo5X72ampXzvxWOBEx6Y3CtzzD2iLDQL3Vs,15634
+datachain/utils.py,sha256=RKe1-VuC9juQSIbIpMnELJ7QrsKQggj8l7Q8_FiCZHE,15664
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
 datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
@@ -27,7 +27,7 @@ datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
-datachain/cli/commands/datasets.py,sha256=Q2zYbiWXYPjg6e_YHyUKaYRg1L6-lxv0L214bogwsUY,6565
+datachain/cli/commands/datasets.py,sha256=DAbONwcA__JM1qkcKVOP5sKukGbCGqLWCMBkBscA3_s,6971
 datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
 datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
 datachain/cli/commands/ls.py,sha256=CBmk838Q-EQp04lE2Qdnpsc1GXAkC4-I-b-a_828n1E,5272
@@ -53,7 +53,7 @@ datachain/data_storage/metastore.py,sha256=aSeTRh43hmrOhULi9YD2VlgCj8B4bjE3jqCOv
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=edcTegzEoAEdEp62Rg9oERvHWXDcpg8d4onrD-P2xKM,30159
-datachain/data_storage/warehouse.py,sha256=66PETLzfkgSmj-EF604m62xmFMQBXaRZSw8sdKGMam8,32613
+datachain/data_storage/warehouse.py,sha256=sEbNiWKdB7yuLt88FuIfRur7U7WiOZrcHWhnBS_eMAg,32642
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -70,7 +70,7 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=6-fZM7wHv0JZ2ZzpLFPLLYW15K_CT5VfYsmx56zBrpA,7419
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=geoLvyDd5uMqS3D9Ec1ODlShCUAdtwHUwl8FqbUX_hg,10776
+datachain/lib/arrow.py,sha256=aedsosbFNjIBa6LQIxR2zhIVcA4pVw1p5hCVmrDhWsQ,10781
 datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
@@ -81,7 +81,7 @@ datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
-datachain/lib/model_store.py,sha256=dkL2rcT5ag-kbgkhQPL_byEs-TCYr29qvdltroL5NxM,2734
+datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,3226
 datachain/lib/namespaces.py,sha256=I6gLC4ZzgyatFtHL85MWR4ml7-yuQOzxHE7IQNbt_ac,2107
 datachain/lib/projects.py,sha256=VJgmzHzKjmNPZD1tm0a1RNHmUQwn6WLWCLpKyc4UrSk,2605
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
@@ -104,15 +104,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=F6EOjPKwSdp26kJsOKGq49D9OxqyKEalINHEwLQav2s,14716
-datachain/lib/dc/datachain.py,sha256=vHGrrFv1vhXadp0JExfrFMioH858Yc00hGbZkCpOdLE,99324
-datachain/lib/dc/datasets.py,sha256=HKQXnCpIGFsYQ9ociLAUm8cwg2H0GaUmgWCF4FkKpbk,15180
+datachain/lib/dc/datachain.py,sha256=2UtDhtBzx5VejkDE0UTS3t1517jCGr7YEKvO5wqNU-Q,99709
+datachain/lib/dc/datasets.py,sha256=-Bvyyu4XXDXLiWa-bOnsp0Q11RSYXRO0j5DaX8ShaFs,15355
 datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=4N1Fq-j5r4GK-PR5jIO-9B2u_zTNX9l-6SmcRhQDAsw,3136
-datachain/lib/dc/storage.py,sha256=FXroEdxOZfbuEBIWfWTkbGwrI0D4_mrLZSRsIQm0WFE,7693
+datachain/lib/dc/storage.py,sha256=OMJE-9ob9Ku5le8W6O8J1W-XJ0pwHt2PsO-ZCcee1ZA,7950
 datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -127,7 +127,7 @@ datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
 datachain/query/dataset.py,sha256=OaGRBNSWYNaRbYn6avij0fiFN5DT-nwdM-wJ4yTfaYs,63317
-datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
+datachain/query/dispatch.py,sha256=f8IIvuLBJaCEwSRv7bWPMy1uXyc28W0LGqrBffjYf98,15831
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
@@ -141,7 +141,7 @@ datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
 datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
 datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
-datachain/sql/types.py,sha256=RWOghtYFx14K-e71QOGg5yfKb-A4-4JgFjaJ0wCZ17Y,15006
+datachain/sql/types.py,sha256=2XbNaQTTc2BGJ6qL7RcwrBByIEbf9PXcsElIz6q9Mkg,15018
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
 datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -154,15 +154,15 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
 datachain/sql/sqlite/__init__.py,sha256=PsLaDSij9a03VxGSpagpNl7NQsGtgm72ArUeALZONoc,183
-datachain/sql/sqlite/base.py,sha256=6aoQHeggY3hs31_YZ-wlYKA1Lto4MFOpgfgRspH6IMc,21498
-datachain/sql/sqlite/types.py,sha256=cH6oge2E_YWFy22wY-txPJH8gxoQFSpCthtZR8PZjpo,1849
+datachain/sql/sqlite/base.py,sha256=WzRxJ8lHAeBCQlh4Z_NmX0CCkxeOt10M_vudCQzY4gE,21510
+datachain/sql/sqlite/types.py,sha256=DCK7q-Zdc_m1o1T33xrKjYX1zRg1231gw3o3ACO_qho,1815
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.30.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.30.4.dist-info/METADATA,sha256=HLbefq934ZEwQ2A7JVkUEqNy_y0_YxGVTu0iRrV1pOo,13903
-datachain-0.30.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.30.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.30.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.30.4.dist-info/RECORD,,
+datachain-0.30.6.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.30.6.dist-info/METADATA,sha256=ZyXo8wdTrN08k--Soy3UHpCu_Jni_6ocO3_PbjCswCE,13898
+datachain-0.30.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.30.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.30.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.30.6.dist-info/RECORD,,