datachain 0.30.3__py3-none-any.whl → 0.30.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +2 -0
- datachain/catalog/__init__.py +2 -0
- datachain/catalog/catalog.py +14 -2
- datachain/catalog/loader.py +4 -2
- datachain/cli/__init__.py +1 -0
- datachain/cli/commands/datasets.py +16 -10
- datachain/data_storage/metastore.py +0 -21
- datachain/data_storage/sqlite.py +0 -4
- datachain/data_storage/warehouse.py +2 -2
- datachain/lib/arrow.py +2 -2
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/datachain.py +26 -18
- datachain/lib/dc/datasets.py +3 -3
- datachain/lib/dc/utils.py +5 -0
- datachain/lib/model_store.py +12 -0
- datachain/lib/namespaces.py +3 -1
- datachain/lib/projects.py +3 -1
- datachain/lib/signal_schema.py +28 -17
- datachain/query/dataset.py +22 -18
- datachain/query/dispatch.py +5 -0
- datachain/query/schema.py +4 -0
- datachain/sql/sqlite/base.py +12 -11
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +3 -3
- datachain/utils.py +8 -1
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/METADATA +3 -3
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/RECORD +31 -31
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/WHEEL +0 -0
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED

@@ -6,6 +6,7 @@ from datachain.lib.dc import (
     Sys,
     datasets,
     delete_dataset,
+    is_studio,
     listings,
     move_dataset,
     read_csv,
@@ -74,6 +75,7 @@ __all__ = [
     "datasets",
     "delete_dataset",
     "is_chain_type",
+    "is_studio",
     "listings",
     "metrics",
     "move_dataset",
datachain/catalog/__init__.py
CHANGED

@@ -3,6 +3,7 @@ from .catalog import (
     QUERY_SCRIPT_CANCELED_EXIT_CODE,
     QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE,
     Catalog,
+    is_namespace_local,
 )
 from .loader import get_catalog

@@ -12,4 +13,5 @@ __all__ = [
     "QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE",
     "Catalog",
     "get_catalog",
+    "is_namespace_local",
 ]
datachain/catalog/catalog.py
CHANGED

@@ -113,6 +113,11 @@ else:
     SIGINT = signal.SIGINT


+def is_namespace_local(namespace_name) -> bool:
+    """Checks if namespace is from local environment, i.e. is `local`"""
+    return namespace_name == "local"
+
+
 def shutdown_process(
     proc: subprocess.Popen,
     interrupt_timeout: Optional[int] = None,
@@ -1121,6 +1126,8 @@ class Catalog:
         pull_dataset: bool = False,
         update: bool = False,
     ) -> DatasetRecord:
+        from datachain.lib.dc.utils import is_studio
+
         # Intentionally ignore update flag is version is provided. Here only exact
         # version can be provided and update then doesn't make sense.
         # It corresponds to a query like this for example:
@@ -1129,7 +1136,12 @@ class Catalog:
         if version:
             update = False

-        if
+        # we don't do Studio fallback is script is already ran in Studio, or if we try
+        # to fetch dataset with local namespace as that one cannot
+        # exist in Studio in the first place
+        no_fallback = is_studio() or is_namespace_local(namespace_name)
+
+        if no_fallback or not update:
             try:
                 ds = self.get_dataset(
                     name,
@@ -1141,7 +1153,7 @@ class Catalog:
         except (NamespaceNotFoundError, ProjectNotFoundError, DatasetNotFoundError):
             pass

-        if
+        if no_fallback:
             raise DatasetNotFoundError(
                 f"Dataset {name}"
                 + (f" version {version} " if version else " ")
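The two hunks above collapse the Studio-fallback decision into one boolean. A minimal sketch of that decision as a standalone function (the function name and signature here are hypothetical; the real logic lives inside the `Catalog` method and calls `is_studio()` and `is_namespace_local()`):

```py
# Hedged sketch, not the library's code.
def should_skip_studio_fallback(namespace_name: str, running_in_studio: bool) -> bool:
    # Skip the Studio fallback when already running inside Studio, or when the
    # namespace is the purely local "local" one, which cannot exist in Studio.
    return running_in_studio or namespace_name == "local"

assert should_skip_studio_fallback("local", running_in_studio=False)
assert should_skip_studio_fallback("team-ns", running_in_studio=True)
assert not should_skip_studio_fallback("team-ns", running_in_studio=False)
```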
datachain/catalog/loader.py
CHANGED

@@ -127,7 +127,8 @@ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:


 def get_catalog(
-    client_config: Optional[dict[str, Any]] = None,
+    client_config: Optional[dict[str, Any]] = None,
+    in_memory: bool = False,
 ) -> "Catalog":
     """
     Function that creates Catalog instance with appropriate metastore
@@ -142,8 +143,9 @@ def get_catalog(
     """
     from datachain.catalog import Catalog

+    metastore = get_metastore(in_memory=in_memory)
     return Catalog(
-        metastore=
+        metastore=metastore,
         warehouse=get_warehouse(in_memory=in_memory),
         client_config=client_config,
         in_memory=in_memory,
datachain/cli/__init__.py
CHANGED

@@ -6,6 +6,7 @@ from tabulate import tabulate
 if TYPE_CHECKING:
     from datachain.catalog import Catalog

+from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
datachain/cli/commands/datasets.py
CHANGED

@@ -138,15 +139,18 @@ def rm_dataset(
 ):
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

-    if
+    if studio:
+        # removing Studio dataset from CLI
         from datachain.studio import remove_studio_dataset

-
-
+        if Config().read().get("studio", {}).get("token"):
+            remove_studio_dataset(
+                team, name, namespace_name, project_name, version, force
+            )
+        else:
             raise DataChainError(
                 "Not logged in to Studio. Log in with 'datachain auth login'."
             )
-        remove_studio_dataset(team, name, namespace_name, project_name, version, force)
     else:
         try:
             project = catalog.metastore.get_project(project_name, namespace_name)
@@ -163,9 +167,11 @@ def edit_dataset(
     attrs: Optional[list[str]] = None,
     team: Optional[str] = None,
 ):
+    from datachain.lib.dc.utils import is_studio
+
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

-    if
+    if is_studio() or is_namespace_local(namespace_name):
         try:
             catalog.edit_dataset(
                 name, catalog.metastore.default_project, new_name, description, attrs
@@ -175,11 +181,11 @@ def edit_dataset(
     else:
         from datachain.studio import edit_studio_dataset

-
-
+        if Config().read().get("studio", {}).get("token"):
+            edit_studio_dataset(
+                team, name, namespace_name, project_name, new_name, description, attrs
+            )
+        else:
             raise DataChainError(
                 "Not logged in to Studio. Log in with 'datachain auth login'."
             )
-        edit_studio_dataset(
-            team, name, namespace_name, project_name, new_name, description, attrs
-        )
datachain/data_storage/metastore.py
CHANGED

@@ -145,23 +145,6 @@ class AbstractMetastore(ABC, Serializable):
     def list_namespaces(self, conn=None) -> list[Namespace]:
         """Gets a list of all namespaces"""

-    @property
-    @abstractmethod
-    def is_studio(self) -> bool:
-        """Returns True if this code is ran in Studio"""
-
-    def is_local_dataset(self, dataset_namespace: str) -> bool:
-        """
-        Returns True if this is local dataset i.e. not pulled from Studio but
-        created locally. This is False if we ran code in CLI mode but using dataset
-        names that are present in Studio.
-        """
-        return self.is_studio or dataset_namespace == Namespace.default()
-
-    @property
-    def namespace_allowed_to_create(self):
-        return self.is_studio
-
     #
     # Projects
     #
@@ -215,10 +198,6 @@ class AbstractMetastore(ABC, Serializable):
     def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
         """Gets list of projects in some namespace or in general (in all namespaces)"""

-    @property
-    def project_allowed_to_create(self):
-        return self.is_studio
-
     #
     # Datasets
     #
datachain/data_storage/sqlite.py
CHANGED

@@ -1,5 +1,4 @@
 import glob
-import json
 import logging
 import posixpath
 import random
@@ -11,6 +10,7 @@ from urllib.parse import urlparse

 import attrs
 import sqlalchemy as sa
+import ujson as json
 from sqlalchemy.sql.expression import true

 from datachain.client import Client
datachain/data_storage/warehouse.py
CHANGED

@@ -122,7 +122,7 @@ class AbstractWarehouse(ABC, Serializable):
         if value_type is str:
             return val
         if value_type in (dict, list):
-            return json.dumps(val)
+            return json.dumps(val, ensure_ascii=False)
         raise ValueError(
             f"Cannot convert value {val!r} with type {value_type} to JSON"
         )
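The `ensure_ascii=False` that recurs throughout this release keeps non-ASCII text readable in stored JSON instead of `\uXXXX`-escaping it. A quick illustration with plain `ujson`, independent of datachain:

```py
import ujson

val = {"name": "café"}
print(ujson.dumps(val))                      # {"name":"caf\u00e9"}  (default escaping)
print(ujson.dumps(val, ensure_ascii=False))  # {"name":"café"}
```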
datachain/lib/arrow.py
CHANGED

@@ -2,8 +2,8 @@ from collections.abc import Sequence
 from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional

-import orjson
 import pyarrow as pa
+import ujson as json
 from pyarrow._csv import ParseOptions
 from pyarrow.dataset import CsvFileFormat, dataset
 from tqdm.auto import tqdm
@@ -269,7 +269,7 @@ def _get_hf_schema(
 def _get_datachain_schema(schema: "pa.Schema") -> Optional[SignalSchema]:
     """Return a restored SignalSchema from parquet metadata, if any is found."""
     if schema.metadata and DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY in schema.metadata:
-        serialized_signal_schema =
+        serialized_signal_schema = json.loads(
             schema.metadata[DATACHAIN_SIGNAL_SCHEMA_PARQUET_KEY]
         )
         return SignalSchema.deserialize(serialized_signal_schema)
datachain/lib/dc/__init__.py
CHANGED

@@ -9,7 +9,7 @@ from .pandas import read_pandas
 from .parquet import read_parquet
 from .records import read_records
 from .storage import read_storage
-from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_studio
 from .values import read_values

 __all__ = [
@@ -21,6 +21,7 @@ __all__ = [
     "Sys",
     "datasets",
     "delete_dataset",
+    "is_studio",
     "listings",
     "move_dataset",
     "read_csv",
datachain/lib/dc/datachain.py
CHANGED

@@ -19,8 +19,8 @@ from typing import (
     overload,
 )

-import orjson
 import sqlalchemy
+import ujson as json
 from pydantic import BaseModel
 from sqlalchemy.sql.elements import ColumnElement
 from tqdm import tqdm
@@ -67,6 +67,7 @@ from .utils import (
     Sys,
     _get_merge_error_str,
     _validate_merge_on,
+    is_studio,
     resolve_columns,
 )

@@ -461,8 +462,6 @@ class DataChain:
         Returns:
             DataChain: A new DataChain instance with the new set of columns.
         """
-        import json
-
         import pyarrow as pa

         from datachain.lib.arrow import schema_to_output
@@ -609,7 +608,7 @@ class DataChain:
             project = self.session.catalog.metastore.get_project(
                 project_name,
                 namespace_name,
-                create=
+                create=is_studio(),
             )
         except ProjectNotFoundError as e:
             # not being able to create it as creation is not allowed
@@ -1184,17 +1183,13 @@ class DataChain:
         )

     def mutate(self, **kwargs) -> "Self":
-        """Create
-
-        This method cannot modify existing columns. If you need to modify an
-        existing column, use a different name for the new column and then use
-        `select()` to choose which columns to keep.
+        """Create or modify signals based on existing signals.

         This method is vectorized and more efficient compared to map(), and it does not
         extract or download any data from the internal database. However, it can only
         utilize predefined built-in functions and their combinations.

-
+        Supported functions:
            Numerical:   +, -, *, /, rand(), avg(), count(), func(),
                         greatest(), least(), max(), min(), sum()
            String:      length(), split(), replace(), regexp_replace()
@@ -1221,13 +1216,20 @@ class DataChain:
        ```

        This method can be also used to rename signals. If the Column("name") provided
-       as value for the new signal - the old
-
+       as value for the new signal - the old signal will be dropped. Otherwise a new
+       signal is created. Exception, if the old signal is nested one (e.g.
+       `C("file.path")`), it will be kept to keep the object intact.

        Example:
        ```py
        dc.mutate(
-           newkey=Column("oldkey")
+           newkey=Column("oldkey")  # drops oldkey
+       )
+       ```
+
+       ```py
+       dc.mutate(
+           size=Column("file.size")  # keeps `file.size`
        )
        ```
        """
@@ -1262,8 +1264,10 @@ class DataChain:
             # adding new signal
             mutated[name] = value

+        new_schema = schema.mutate(kwargs)
         return self._evolve(
-            query=self._query.mutate(**mutated),
+            query=self._query.mutate(new_schema=new_schema, **mutated),
+            signal_schema=new_schema,
         )

     @property
@@ -2123,9 +2127,9 @@ class DataChain:
         fsspec_fs = client.create_fs(**fs_kwargs)

         _partition_cols = list(partition_cols) if partition_cols else None
-        signal_schema_metadata =
-            self._effective_signals_schema.serialize()
-        )
+        signal_schema_metadata = json.dumps(
+            self._effective_signals_schema.serialize(), ensure_ascii=False
+        ).encode("utf-8")

         column_names, column_chunks = self.to_columnar_data_with_names(chunk_size)

@@ -2272,7 +2276,11 @@ class DataChain:
                     f.write(b"\n")
                 else:
                     is_first = False
-                f.write(
+                f.write(
+                    json.dumps(
+                        row_to_nested_dict(headers, row), ensure_ascii=False
+                    ).encode("utf-8")
+                )
             if include_outer_list:
                 # This makes the file JSON instead of JSON lines.
                 f.write(b"\n]\n")
datachain/lib/dc/datasets.py
CHANGED

@@ -13,7 +13,7 @@ from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery

-from .utils import Sys
+from .utils import Sys, is_studio
 from .values import read_values

 if TYPE_CHECKING:
@@ -343,7 +343,7 @@ def delete_dataset(
         namespace_name=namespace,
     )

-    if not
+    if not is_studio() and studio:
         return remove_studio_dataset(
             None, name, namespace_name, project_name, version=version, force=force
         )
@@ -418,6 +418,6 @@ def move_dataset(
         project_id=catalog.metastore.get_project(
             dest_project,
             dest_namespace,
-            create=
+            create=is_studio(),
         ).id,
     )
datachain/lib/dc/utils.py
CHANGED

@@ -15,6 +15,7 @@ from datachain.func.base import Function
 from datachain.lib.data_model import DataModel, DataType
 from datachain.lib.utils import DataChainParamsError
 from datachain.query.schema import DEFAULT_DELIMITER
+from datachain.utils import getenv_bool

 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec
@@ -26,6 +27,10 @@ if TYPE_CHECKING:
 D = TypeVar("D", bound="DataChain")


+def is_studio() -> bool:
+    return getenv_bool("DATACHAIN_IS_STUDIO", default=False)
+
+
 def resolve_columns(
     method: "Callable[Concatenate[D, P], D]",
 ) -> "Callable[Concatenate[D, P], D]":
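Per the definitions above (`is_studio()` delegating to `getenv_bool`, shown later in this diff under datachain/utils.py), the flag is read from the environment on every call, so it can be toggled at runtime:

```py
import os
from datachain.lib.dc.utils import is_studio

os.environ["DATACHAIN_IS_STUDIO"] = "true"  # "1", "true", "yes", "on" are truthy
assert is_studio()

os.environ["DATACHAIN_IS_STUDIO"] = "0"
assert not is_studio()
```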
datachain/lib/model_store.py
CHANGED

@@ -89,3 +89,15 @@ class ModelStore:
             and ModelStore.is_pydantic(parent_type)
             and "@" in ModelStore.get_name(parent_type)
         )
+
+    @classmethod
+    def rebuild_all(cls) -> None:
+        """Ensure pydantic schemas are (re)built for all registered models.
+
+        Uses ``force=True`` to avoid subtle cases where a deserialized class
+        (e.g. from by-value cloudpickle in workers) reports built state but
+        nested model field schemas aren't fully resolved yet.
+        """
+        for versions in cls.store.values():
+            for model in versions.values():
+                model.model_rebuild(force=True)
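`model_rebuild(force=True)` is the standard pydantic v2 API for re-resolving a model's field schemas. A self-contained sketch of the kind of late-resolved model `rebuild_all()` exists to handle (class names here are illustrative, not from the library):

```py
from pydantic import BaseModel

class Inner(BaseModel):
    x: int

class Outer(BaseModel):
    inner: "Inner"  # forward reference, resolved when the schema is built

# rebuild_all() does effectively this for every model registered in the store:
Outer.model_rebuild(force=True)
print(Outer(inner={"x": 1}).inner.x)  # 1
```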
datachain/lib/namespaces.py
CHANGED

@@ -28,7 +28,9 @@ def create(
     """
     session = Session.get(session)

-
+    from datachain.lib.dc.utils import is_studio
+
+    if not is_studio():
         raise NamespaceCreateNotAllowedError("Creating namespace is not allowed")

     Namespace.validate_name(name)
datachain/lib/projects.py
CHANGED

@@ -32,7 +32,9 @@ def create(
     """
     session = Session.get(session)

-
+    from datachain.lib.dc.utils import is_studio
+
+    if not is_studio():
         raise ProjectCreateNotAllowedError("Creating project is not allowed")

     Project.validate_name(name)
datachain/lib/signal_schema.py
CHANGED

@@ -34,7 +34,7 @@ from datachain.lib.data_model import DataModel, DataType, DataValue
 from datachain.lib.file import File
 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import DataChainParamsError
-from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
+from datachain.query.schema import DEFAULT_DELIMITER, C, Column, ColumnMeta
 from datachain.sql.types import SQLType

 if TYPE_CHECKING:
@@ -680,35 +680,46 @@ class SignalSchema:
         primitives = (bool, str, int, float)

         for name, value in args_map.items():
+            current_type = None
+
+            if C.is_nested(name):
+                try:
+                    current_type = self.get_column_type(name)
+                except SignalResolvingError as err:
+                    msg = f"Creating new nested columns directly is not allowed: {name}"
+                    raise ValueError(msg) from err
+
             if isinstance(value, Column) and value.name in self.values:
                 # renaming existing signal
+                # Note: it won't touch nested signals here (e.g. file__path)
+                # we don't allow removing nested columns to keep objects consistent
                 del new_values[value.name]
                 new_values[name] = self.values[value.name]
-
-            if isinstance(value, Column):
+            elif isinstance(value, Column):
                 # adding new signal from existing signal field
-
-
-                    value.name, with_subtree=True
-                )
-                continue
-                except SignalResolvingError:
-                    pass
-            if isinstance(value, Func):
+                new_values[name] = self.get_column_type(value.name, with_subtree=True)
+            elif isinstance(value, Func):
                 # adding new signal with function
                 new_values[name] = value.get_result_type(self)
-
-            if isinstance(value, primitives):
+            elif isinstance(value, primitives):
                 # For primitives, store the type, not the value
                 val = literal(value)
                 val.type = python_to_sql(type(value))()
                 new_values[name] = sql_to_python(val)
-
-            if isinstance(value, ColumnElement):
+            elif isinstance(value, ColumnElement):
                 # adding new signal
                 new_values[name] = sql_to_python(value)
-
-
+            else:
+                new_values[name] = value
+
+            if C.is_nested(name):
+                if current_type != new_values[name]:
+                    msg = (
+                        f"Altering nested column type is not allowed: {name}, "
+                        f"current type: {current_type}, new type: {new_values[name]}"
+                    )
+                    raise ValueError(msg)
+                del new_values[name]

         return SignalSchema(new_values)
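A short usage sketch of the rules this rewritten loop enforces; the chain below is illustrative, but `read_values`, `mutate`, and `C` are the public API referenced in the docstring earlier in this diff:

```py
import datachain as dc
from datachain import C

chain = dc.read_values(num=[1, 2, 3])
chain = chain.mutate(double=C("num") * 2)   # new signal from an expression
chain = chain.mutate(renamed=C("double"))   # rename: `double` is dropped

# Nested signals behave differently: mutate(size=C("file.size")) keeps
# `file.size` so the parent object stays intact, while creating a brand-new
# nested column (or changing a nested column's type) raises ValueError.
```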
datachain/query/dataset.py
CHANGED

@@ -10,7 +10,6 @@ from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
-from secrets import token_hex
 from types import GeneratorType
 from typing import (
     TYPE_CHECKING,
@@ -29,7 +28,7 @@ from attrs import frozen
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
 from sqlalchemy import Column
 from sqlalchemy.sql import func as f
-from sqlalchemy.sql.elements import ColumnClause, ColumnElement
+from sqlalchemy.sql.elements import ColumnClause, ColumnElement, Label
 from sqlalchemy.sql.expression import label
 from sqlalchemy.sql.schema import TableClause
 from sqlalchemy.sql.selectable import Select
@@ -46,6 +45,7 @@ from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
+from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.project import Project
@@ -795,28 +795,32 @@ class SQLSelectExcept(SQLClause):

 @frozen
 class SQLMutate(SQLClause):
-    args: tuple[
+    args: tuple[Label, ...]
+    new_schema: SignalSchema

     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
-
-            original_subquery.c[str(c)] if isinstance(c, (str, C)) else c
-            for c in self.parse_cols(self.args)
-        ]
-        to_mutate = {c.name for c in args}
+        to_mutate = {c.name for c in self.args}

-
-
-
+        # Drop the original versions to avoid name collisions, exclude renamed
+        # columns. Always keep system columns (sys__*) if they exist in original query
+        new_schema_columns = set(self.new_schema.db_signals())
+        base_cols = [
+            c
             for c in original_subquery.c
+            if c.name not in to_mutate
+            and (c.name in new_schema_columns or c.name.startswith("sys__"))
         ]
-
-        #
-
-
+
+        # Create intermediate subquery to properly handle window functions
+        intermediate_query = sqlalchemy.select(*base_cols, *self.args).select_from(
+            original_subquery
         )
+        intermediate_subquery = intermediate_query.subquery()

-        return sqlalchemy.select(*
+        return sqlalchemy.select(*intermediate_subquery.c).select_from(
+            intermediate_subquery
+        )


 @frozen
@@ -1470,7 +1474,7 @@ class DatasetQuery:
         return query

     @detach
-    def mutate(self, *args, **kwargs) -> "Self":
+    def mutate(self, *args, new_schema, **kwargs) -> "Self":
         """
         Add new columns to this query.

@@ -1482,7 +1486,7 @@ class DatasetQuery:
         """
         query_args = [v.label(k) for k, v in dict(args, **kwargs).items()]
         query = self.clone()
-        query.steps.append(SQLMutate((*query_args,)))
+        query.steps.append(SQLMutate((*query_args,), new_schema))
         return query

     @detach
datachain/query/dispatch.py
CHANGED

@@ -13,6 +13,7 @@ from multiprocess import get_context
 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
 from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
+from datachain.lib.model_store import ModelStore
 from datachain.lib.udf import _get_cache
 from datachain.query.dataset import (
     get_download_callback,
@@ -130,6 +131,8 @@ class UDFDispatcher:

     def _create_worker(self) -> "UDFWorker":
         udf: UDFAdapter = loads(self.udf_data)
+        # Ensure all registered DataModels have rebuilt schemas in worker processes.
+        ModelStore.rebuild_all()
         return UDFWorker(
             self.catalog,
             udf,
@@ -196,6 +199,8 @@ class UDFDispatcher:
         generated_cb: Callback = DEFAULT_CALLBACK,
     ) -> None:
         udf: UDFAdapter = loads(self.udf_data)
+        # Rebuild schemas in single process too for consistency (cheap, idempotent).
+        ModelStore.rebuild_all()

         if ids_only and not self.is_batching:
             input_rows = flatten(input_rows)
datachain/query/schema.py
CHANGED

@@ -36,6 +36,10 @@ class ColumnMeta(type):
     def __getattr__(cls, name: str):
         return cls(ColumnMeta.to_db_name(name))

+    @staticmethod
+    def is_nested(name: str) -> bool:
+        return DEFAULT_DELIMITER in name
+

 class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True
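Since the helper lives on the metaclass, it is reachable straight off the `Column` class (and its `C` alias). Assuming the default `__` delimiter used for flattened nested fields:

```py
from datachain.query.schema import C

assert C.is_nested("file__path")   # flattened nested signal
assert not C.is_nested("name")     # top-level signal
```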
datachain/sql/sqlite/base.py
CHANGED

@@ -8,8 +8,8 @@ from functools import cache
 from types import MappingProxyType
 from typing import Callable, Optional

-import orjson
 import sqlalchemy as sa
+import ujson as json
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext.compiler import compiles
 from sqlalchemy.sql.elements import literal
@@ -182,7 +182,7 @@ def missing_vector_function(name, exc):


 def sqlite_string_split(string: str, sep: str, maxsplit: int = -1) -> str:
-    return
+    return json.dumps(string.split(sep, maxsplit), ensure_ascii=False)


 def sqlite_int_hash_64(x: int) -> int:
@@ -453,17 +453,17 @@ def compile_byte_hamming_distance(element, compiler, **kwargs):


 def py_json_array_length(arr):
-    return len(
+    return len(json.loads(arr))


 def py_json_array_contains(arr, value, is_json):
     if is_json:
-        value =
-    return value in
+        value = json.loads(value)
+    return value in json.loads(arr)


 def py_json_array_get_element(val, idx):
-    arr =
+    arr = json.loads(val)
     try:
         return arr[idx]
     except IndexError:
@@ -471,17 +471,18 @@ def py_json_array_get_element(val, idx):


 def py_json_array_slice(val, offset: int, length: Optional[int] = None):
-    arr =
+    arr = json.loads(val)
     try:
-        return
-        list(arr[offset : offset + length] if length is not None else arr[offset:])
-
+        return json.dumps(
+            list(arr[offset : offset + length] if length is not None else arr[offset:]),
+            ensure_ascii=False,
+        )
     except IndexError:
         return None


 def py_json_array_join(val, sep: str):
-    return sep.join(
+    return sep.join(json.loads(val))


 def compile_array_get_element(element, compiler, **kwargs):
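These `py_json_array_*` helpers are plain Python functions that SQLite ends up calling from SQL. The registration below is an illustrative sketch using the stdlib `sqlite3` API (with a hypothetical function name), not datachain's actual wiring:

```py
import sqlite3
import ujson as json

def py_json_array_length(arr):
    return len(json.loads(arr))

conn = sqlite3.connect(":memory:")
# Register the Python helper as a 1-argument SQL function on this connection.
conn.create_function("json_array_length_py", 1, py_json_array_length)
print(conn.execute("SELECT json_array_length_py('[1, 2, 3]')").fetchone()[0])  # 3
```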
datachain/sql/sqlite/types.py
CHANGED

@@ -1,6 +1,6 @@
 import sqlite3

-import
+import ujson as json
 from sqlalchemy import types

 from datachain.sql.types import TypeConverter, TypeReadConverter
@@ -28,26 +28,21 @@ class Array(types.UserDefinedType):


 def adapt_array(arr):
-    return
+    return json.dumps(arr, ensure_ascii=False)


 def adapt_dict(dct):
-    return
+    return json.dumps(dct, ensure_ascii=False)


 def convert_array(arr):
-    return
+    return json.loads(arr)


 def adapt_np_array(arr):
-
-
-
-        return obj
-
-    return orjson.dumps(
-        arr, option=orjson.OPT_SERIALIZE_NUMPY, default=_json_serialize
-    ).decode("utf-8")
+    # Primarily needed for UDF numpy results (e.g. WDS)
+    # tolist() gives nested Python lists + native scalars; ujson.dumps handles NaN/Inf.
+    return json.dumps(arr.tolist(), ensure_ascii=False)


 def adapt_np_generic(val):
@@ -74,5 +69,5 @@ class SQLiteTypeConverter(TypeConverter):
 class SQLiteTypeReadConverter(TypeReadConverter):
     def array(self, value, item_type, dialect):
         if isinstance(value, str):
-            value =
+            value = json.loads(value)
         return super().array(value, item_type, dialect)
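To see why the simpler `tolist()` path replaces orjson's `OPT_SERIALIZE_NUMPY`: converting the array first yields nested native Python lists, which ujson serializes directly, including non-finite floats. A hedged sketch (output comment reflects ujson's compact, NaN-preserving format):

```py
import numpy as np
import ujson as json

arr = np.array([[1.5, 2.5], [3.5, float("nan")]])
# tolist() produces nested Python lists with native floats.
print(json.dumps(arr.tolist(), ensure_ascii=False))  # [[1.5,2.5],[3.5,NaN]]
```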
datachain/sql/types.py
CHANGED

@@ -16,8 +16,8 @@ from datetime import datetime
 from types import MappingProxyType
 from typing import Any, Union

-import orjson
 import sqlalchemy as sa
+import ujson as jsonlib
 from sqlalchemy import TypeDecorator, types

 from datachain.lib.data_model import StandardType
@@ -352,7 +352,7 @@ class Array(SQLType):
     def on_read_convert(self, value, dialect):
         r = read_converter(dialect).array(value, self.item_type, dialect)
         if isinstance(self.item_type, JSON):
-            r = [
+            r = [jsonlib.loads(item) if isinstance(item, str) else item for item in r]
         return r

@@ -466,7 +466,7 @@ class TypeReadConverter:
         if isinstance(value, str):
             if value == "":
                 return {}
-            return
+            return jsonlib.loads(value)
         return value

     def datetime(self, value):
datachain/utils.py
CHANGED

@@ -417,7 +417,7 @@ class JSONSerialize(json.JSONEncoder):

 def inside_colab() -> bool:
     try:
-        from google import colab  # noqa: F401
+        from google import colab  # type: ignore[attr-defined]  # noqa: F401
     except ImportError:
         return False
     return True
@@ -531,3 +531,10 @@ def safe_closing(thing: T) -> Iterator[T]:
     finally:
         if hasattr(thing, "close"):
             thing.close()
+
+
+def getenv_bool(name: str, default: bool = False) -> bool:
+    val = os.getenv(name)
+    if val is None:
+        return default
+    return val.lower() in ("1", "true", "yes", "on")
{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.30.3
+Version: 0.30.5
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -22,6 +22,7 @@ Requires-Dist: tomlkit
 Requires-Dist: tqdm
 Requires-Dist: numpy<3,>=1
 Requires-Dist: pandas>=2.0.0
+Requires-Dist: ujson>=5.10.0
 Requires-Dist: packaging
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -38,7 +39,6 @@ Requires-Dist: shtab<2,>=1.3.4
 Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
-Requires-Dist: orjson>=3.10.5
 Requires-Dist: pydantic
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
@@ -92,7 +92,7 @@ Requires-Dist: pytest-mock>=3.12.0; extra == "tests"
 Requires-Dist: pytest-servers[all]>=0.5.9; extra == "tests"
 Requires-Dist: pytest-benchmark[histogram]; extra == "tests"
 Requires-Dist: pytest-xdist>=3.3.1; extra == "tests"
-Requires-Dist: pytest-
+Requires-Dist: pytest-dotenv; extra == "tests"
 Requires-Dist: virtualenv; extra == "tests"
 Requires-Dist: dulwich; extra == "tests"
 Requires-Dist: hypothesis; extra == "tests"
{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=Ze-u6SSNsTFBRFw0lVPCdoP0kt8ybKxJIhO8jfC22Cw,1744
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
@@ -19,15 +19,15 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=27750qCSNxIChEzhV02damIFreLMfr7UdiWqMFyk8AA,15361
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=
-datachain/catalog/__init__.py,sha256=
-datachain/catalog/catalog.py,sha256=
+datachain/utils.py,sha256=RKe1-VuC9juQSIbIpMnELJ7QrsKQggj8l7Q8_FiCZHE,15664
+datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
+datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
-datachain/catalog/loader.py,sha256=
-datachain/cli/__init__.py,sha256=
+datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
+datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
-datachain/cli/commands/datasets.py,sha256=
+datachain/cli/commands/datasets.py,sha256=Q2zYbiWXYPjg6e_YHyUKaYRg1L6-lxv0L214bogwsUY,6565
 datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
 datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
 datachain/cli/commands/ls.py,sha256=CBmk838Q-EQp04lE2Qdnpsc1GXAkC4-I-b-a_828n1E,5272
@@ -49,11 +49,11 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=aSeTRh43hmrOhULi9YD2VlgCj8B4bjE3jqCOvnb_HQs,53851
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
+datachain/data_storage/sqlite.py,sha256=edcTegzEoAEdEp62Rg9oERvHWXDcpg8d4onrD-P2xKM,30159
+datachain/data_storage/warehouse.py,sha256=sEbNiWKdB7yuLt88FuIfRur7U7WiOZrcHWhnBS_eMAg,32642
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -70,7 +70,7 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=6-fZM7wHv0JZ2ZzpLFPLLYW15K_CT5VfYsmx56zBrpA,7419
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=aedsosbFNjIBa6LQIxR2zhIVcA4pVw1p5hCVmrDhWsQ,10781
 datachain/lib/audio.py,sha256=fQmIBq-9hrUZtkgeJdPHYA_D8Wfe9D4cQZk4_ijxpNc,7580
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=Rjah76GHwIV6AZQk4rsdg6JLre5D8Kb9T4PS5SXzsPA,3740
@@ -81,12 +81,12 @@ datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
-datachain/lib/model_store.py,sha256=
-datachain/lib/namespaces.py,sha256=
-datachain/lib/projects.py,sha256=
+datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,3226
+datachain/lib/namespaces.py,sha256=I6gLC4ZzgyatFtHL85MWR4ml7-yuQOzxHE7IQNbt_ac,2107
+datachain/lib/projects.py,sha256=VJgmzHzKjmNPZD1tm0a1RNHmUQwn6WLWCLpKyc4UrSk,2605
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=n0YYhCVdgCdMkCSLY7kscJF9mUhlQ0a4ENWBsJFynkw,3809
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
 datachain/lib/udf.py,sha256=IB1IKF5KyA-NiyfhVzmBPpF_aITPS3zSlrt24f_Ofjo,17956
@@ -101,11 +101,11 @@ datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djM
 datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
-datachain/lib/dc/__init__.py,sha256=
+datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=F6EOjPKwSdp26kJsOKGq49D9OxqyKEalINHEwLQav2s,14716
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=cJ0lbFteO5ync08M1QbriRrSAATOmU-nDkbxSH6SYgA,99462
+datachain/lib/dc/datasets.py,sha256=HKQXnCpIGFsYQ9ociLAUm8cwg2H0GaUmgWCF4FkKpbk,15180
 datachain/lib/dc/hf.py,sha256=AP_MUHg6HJWae10PN9hD_beQVjrl0cleZ6Cvhtl1yoI,2901
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -113,7 +113,7 @@ datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,14
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=4N1Fq-j5r4GK-PR5jIO-9B2u_zTNX9l-6SmcRhQDAsw,3136
 datachain/lib/dc/storage.py,sha256=FXroEdxOZfbuEBIWfWTkbGwrI0D4_mrLZSRsIQm0WFE,7693
-datachain/lib/dc/utils.py,sha256=
+datachain/lib/dc/utils.py,sha256=9OMiFu2kXIbtMqzJTEr1qbCoCBGpOmTnkWImVgFTKgo,4112
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
 datachain/model/bbox.py,sha256=cQNHuQuVsh6bW3n3Hj40F2Cc20cExQ9Lg_q7R2jxUMI,9324
@@ -126,12 +126,12 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=OaGRBNSWYNaRbYn6avij0fiFN5DT-nwdM-wJ4yTfaYs,63317
+datachain/query/dispatch.py,sha256=f8IIvuLBJaCEwSRv7bWPMy1uXyc28W0LGqrBffjYf98,15831
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
-datachain/query/schema.py,sha256=
+datachain/query/schema.py,sha256=qLpEyvnzKlNCOrThQiTNpUKTUEsVIHT9trt-0UMt6ko,6704
 datachain/query/session.py,sha256=gKblltJAVQAVSTswAgWGDgGbpmFlFzFVkIQojDCjgXM,6809
 datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
 datachain/query/utils.py,sha256=a2PTBZ3qsG6XlUcp9XsoGiQfKkca4Q3m-VzFgiGQPAc,1230
@@ -141,7 +141,7 @@ datachain/sql/__init__.py,sha256=8D2omsBiATt8bjLjGo6jBEtaKEkOlnlNFWhVryHMDv0,388
 datachain/sql/postgresql_dialect.py,sha256=pDTfH8xaXz5xZsq8O1aQUvWLRIv_ogYeAqtmKlPp3Rw,280
 datachain/sql/postgresql_types.py,sha256=ryb_0lzuA9UOJ_B6nW9Yb8nJjzeSmEItAL_Ceue65lc,627
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
-datachain/sql/types.py,sha256=
+datachain/sql/types.py,sha256=2XbNaQTTc2BGJ6qL7RcwrBByIEbf9PXcsElIz6q9Mkg,15018
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
 datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -154,15 +154,15 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
 datachain/sql/sqlite/__init__.py,sha256=PsLaDSij9a03VxGSpagpNl7NQsGtgm72ArUeALZONoc,183
-datachain/sql/sqlite/base.py,sha256=
-datachain/sql/sqlite/types.py,sha256=
+datachain/sql/sqlite/base.py,sha256=WzRxJ8lHAeBCQlh4Z_NmX0CCkxeOt10M_vudCQzY4gE,21510
+datachain/sql/sqlite/types.py,sha256=DCK7q-Zdc_m1o1T33xrKjYX1zRg1231gw3o3ACO_qho,1815
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.30.3.dist-info/licenses/LICENSE,sha256=
-datachain-0.30.3.dist-info/METADATA,sha256=
-datachain-0.30.3.dist-info/WHEEL,sha256=
-datachain-0.30.3.dist-info/entry_points.txt,sha256=
-datachain-0.30.3.dist-info/top_level.txt,sha256=
-datachain-0.30.3.dist-info/RECORD,,
+datachain-0.30.5.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.30.5.dist-info/METADATA,sha256=90OhCbSbqZn245Dm4-9zgbyBxI6N2NF8jsIdEiw6PUs,13898
+datachain-0.30.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.30.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.30.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.30.5.dist-info/RECORD,,
{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/WHEEL
File without changes

{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/entry_points.txt
File without changes

{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/licenses/LICENSE
File without changes

{datachain-0.30.3.dist-info → datachain-0.30.5.dist-info}/top_level.txt
File without changes