datachain 0.14.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/catalog/catalog.py +13 -5
- datachain/catalog/loader.py +11 -7
- datachain/data_storage/schema.py +21 -23
- datachain/data_storage/sqlite.py +1 -1
- datachain/data_storage/warehouse.py +6 -8
- datachain/lib/convert/values_to_tuples.py +23 -14
- datachain/lib/dc/__init__.py +4 -1
- datachain/lib/dc/csv.py +3 -3
- datachain/lib/dc/database.py +151 -0
- datachain/lib/dc/datachain.py +25 -15
- datachain/lib/dc/datasets.py +70 -10
- datachain/lib/dc/hf.py +5 -5
- datachain/lib/dc/json.py +7 -7
- datachain/lib/dc/listings.py +3 -3
- datachain/lib/dc/pandas.py +13 -6
- datachain/lib/dc/parquet.py +3 -3
- datachain/lib/dc/records.py +12 -14
- datachain/lib/dc/storage.py +6 -6
- datachain/lib/dc/values.py +3 -3
- datachain/lib/listing.py +2 -2
- datachain/lib/signal_schema.py +34 -10
- datachain/listing.py +4 -4
- datachain/query/dataset.py +10 -12
- datachain/query/dispatch.py +7 -2
- datachain/query/schema.py +4 -1
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/METADATA +3 -3
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/RECORD +32 -31
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/WHEEL +0 -0
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datasets.py
CHANGED
@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING, Optional, get_origin, get_type_hints
 
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
@@ -102,7 +99,7 @@ def datasets(
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-    object_name: str = "dataset",
+    column: Optional[str] = None,
     include_listing: bool = False,
     studio: bool = False,
 ) -> "DataChain":
@@ -112,7 +109,8 @@ def datasets(
         session: Optional session instance. If not provided, uses default session.
         settings: Optional dictionary of settings to configure the chain.
         in_memory: If True, creates an in-memory session. Defaults to False.
-        object_name: Name of the output object in the chain. Defaults to "dataset".
+        column: Name of the output column in the chain. Defaults to None which
+            means no top level column will be created.
         include_listing: If True, includes listing datasets. Defaults to False.
         studio: If True, returns datasets from Studio only,
             otherwise returns all local datasets. Defaults to False.
@@ -124,7 +122,7 @@ def datasets(
         ```py
         import datachain as dc
 
-        chain = dc.datasets()
+        chain = dc.datasets(column="dataset")
         for ds in chain.collect("dataset"):
             print(f"{ds.name}@v{ds.version}")
         ```
@@ -139,13 +137,75 @@ def datasets(
             include_listing=include_listing, studio=studio
         )
     ]
-
     datasets_values = [d for d in datasets_values if not d.is_temp]
 
+    if not column:
+        # flattening dataset fields
+        schema = {
+            k: get_origin(v) if get_origin(v) is dict else v
+            for k, v in get_type_hints(DatasetInfo).items()
+            if k in DatasetInfo.model_fields
+        }
+        data = {k: [] for k in DatasetInfo.model_fields}  # type: ignore[var-annotated]
+        for d in [d.model_dump() for d in datasets_values]:
+            for field, value in d.items():
+                data[field].append(value)
+
+        return read_values(
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            output=schema,
+            **data,  # type: ignore[arg-type]
+        )
+
     return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
-        output={object_name: DatasetInfo},
-        **{object_name: datasets_values},  # type: ignore[arg-type]
+        output={column: DatasetInfo},
+        **{column: datasets_values},  # type: ignore[arg-type]
     )
+
+
+def delete_dataset(
+    name: str,
+    version: Optional[int] = None,
+    force: Optional[bool] = False,
+    studio: Optional[bool] = False,
+    session: Optional[Session] = None,
+    in_memory: bool = False,
+) -> None:
+    """Removes specific dataset version or all dataset versions, depending on
+    a force flag.
+
+    Args:
+        name : Dataset name
+        version : Optional dataset version
+        force: If true, all datasets versions will be removed. Defaults to False.
+        studio: If True, removes dataset from Studio only,
+            otherwise remove from local. Defaults to False.
+        session: Optional session instance. If not provided, uses default session.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+
+    Returns: None
+
+    Example:
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats")
+        ```
+
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats", version=1)
+        ```
+    """
+
+    session = Session.get(session, in_memory=in_memory)
+    catalog = session.catalog
+    if not force:
+        version = version or catalog.get_dataset(name).latest_version
+    else:
+        version = None
+    catalog.remove_dataset(name, version=version, force=force, studio=studio)
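Taken together, these changes flatten dataset fields into top-level columns by default and replace the removed `DatasetQuery.delete` (see query/dataset.py below) with a public helper. A minimal sketch of the new API, based on the docstrings above; the dataset names are illustrative:

```py
import datachain as dc

# New default: DatasetInfo fields become top-level columns.
flat = dc.datasets()

# Passing `column` keeps the previous nested shape under one column.
nested = dc.datasets(column="dataset")
for ds in nested.collect("dataset"):
    print(f"{ds.name}@v{ds.version}")

# New top-level replacement for the removed DatasetQuery.delete:
dc.delete_dataset("cats", version=1)
```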
datachain/lib/dc/hf.py
CHANGED
@@ -23,7 +23,7 @@ def read_hf(
     *args,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     **kwargs,
 ) -> "DataChain":
@@ -34,7 +34,7 @@ def read_hf(
             or an instance of `datasets.Dataset`-like object.
         session : Session to use for the chain.
         settings : Settings to use for the chain.
-        object_name : Generated object column name.
+        column : Generated object column name.
         model_name : Generated model name.
         kwargs : Parameters to pass to datasets.load_dataset.
 
@@ -62,12 +62,12 @@ def read_hf(
     if len(ds_dict) > 1:
         output = {"split": str}
 
-    model_name = model_name or object_name or ""
+    model_name = model_name or column or ""
     hf_features = next(iter(ds_dict.values())).features
     output = output | get_output_schema(hf_features)
     model = dict_to_data_model(model_name, output)
-    if object_name:
-        output = {object_name: model}
+    if column:
+        output = {column: model}
 
     chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
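As the last hunk shows, the renamed `column` argument also doubles as the fallback model name when `model_name` is empty. A hedged usage sketch (assumes the `datasets` package is installed; the dataset id is illustrative):

```py
import datachain as dc

# HF signals are nested under "row"; "row" also becomes the generated
# model name since model_name is empty.
chain = dc.read_hf("beans", split="train", column="row")
```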
datachain/lib/dc/json.py
CHANGED
@@ -28,7 +28,7 @@ def read_json(
     spec: Optional[DataType] = None,
     schema_from: Optional[str] = "auto",
     jmespath: Optional[str] = None,
-    object_name: Optional[str] = "",
+    column: Optional[str] = "",
     model_name: Optional[str] = None,
     format: Optional[str] = "json",
     nrows=None,
@@ -42,7 +42,7 @@ def read_json(
     type : read file as "binary", "text", or "image" data. Default is "text".
     spec : optional Data Model
     schema_from : path to sample to infer spec (if schema not provided)
-    object_name : generated object name
+    column : generated column name
     model_name : optional generated model name
     format: "json", "jsonl"
     jmespath : optional JMESPATH expression to reduce JSON
@@ -70,13 +70,13 @@ def read_json(
         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
         return s[:name_end]
 
-    if (not object_name) and jmespath:
-        object_name = jmespath_to_name(jmespath)
-    if not object_name:
-        object_name = format
+    if (not column) and jmespath:
+        column = jmespath_to_name(jmespath)
+    if not column:
+        column = format
     chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
-        object_name: read_meta(
+        column: read_meta(
             schema_from=schema_from,
             format=format,
             spec=spec,
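The fallback chain for the column name is unchanged apart from the rename: explicit `column`, then a name derived from the `jmespath` expression, then the `format`. A sketch reusing the demo bucket from the README below:

```py
import datachain as dc

# Explicit name: parsed JSON signals land under "meta".
meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)

# No column and no jmespath: signals default to the format name, "json".
auto = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", anon=True)
```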
datachain/lib/dc/listings.py
CHANGED
@@ -19,7 +19,7 @@ if TYPE_CHECKING:
 def listings(
     session: Optional[Session] = None,
     in_memory: bool = False,
-    object_name: str = "listing",
+    column: str = "listing",
     **kwargs,
 ) -> "DataChain":
     """Generate chain with list of cached listings.
@@ -38,6 +38,6 @@ def listings(
     return read_values(
         session=session,
         in_memory=in_memory,
-        output={object_name: ListingInfo},
-        **{object_name: catalog.listings()},  # type: ignore[arg-type]
+        output={column: ListingInfo},
+        **{column: catalog.listings()},  # type: ignore[arg-type]
     )
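A short sketch of the renamed parameter in use:

```py
import datachain as dc

# Cached listings are exposed under the "listing" column by default.
for info in dc.listings().collect("listing"):
    print(info)  # ListingInfo objects
```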
datachain/lib/dc/pandas.py
CHANGED
@@ -22,7 +22,7 @@ def read_pandas(  # type: ignore[override]
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
-    object_name: str = "",
+    column: str = "",
 ) -> "DataChain":
     """Generate chain from pandas data-frame.
 
@@ -37,20 +37,27 @@ def read_pandas(  # type: ignore[override]
     """
     from .utils import DatasetPrepareError
 
-    fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()
 
-    for column in fr_map:
-        if not column.isidentifier():
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
+
+    for c in fr_map:
+        if not c.isidentifier():
             raise DatasetPrepareError(
                 name,
-                f"import from pandas error - '{column}' cannot be a column name",
+                f"import from pandas error - '{c}' cannot be a column name",
             )
 
     return read_values(
         name,
         session,
         settings=settings,
-        object_name=object_name,
+        column=column,
         in_memory=in_memory,
         **fr_map,
     )
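The new `get_col_name` helper makes MultiIndex data frames importable by joining the tuple levels with underscores. A self-contained sketch:

```py
import pandas as pd
import datachain as dc

# ("Price", "USD") flattens to the column name "price_usd".
df = pd.DataFrame(
    [[1.0, 0.9], [2.0, 1.8]],
    columns=pd.MultiIndex.from_tuples([("Price", "USD"), ("Price", "EUR")]),
)
chain = dc.read_pandas(df)
```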
datachain/lib/dc/parquet.py
CHANGED
@@ -19,7 +19,7 @@ def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     source: bool = True,
     session: Optional[Session] = None,
@@ -33,7 +33,7 @@
         as `s3://`, `gs://`, `az://` or "file:///".
     partitioning : Any pyarrow partitioning schema.
     output : Dictionary defining column names and their corresponding types.
-    object_name : Created object column name.
+    column : Created column name.
     model_name : Generated model name.
     source : Whether to include info about the source file.
     session : Session to use for the chain.
@@ -57,7 +57,7 @@
     chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         format="parquet",
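Here the rename is purely mechanical: `column` is forwarded to `parse_tabular`. Sketch with an illustrative path:

```py
import datachain as dc

# Parsed parquet columns are nested under the "data" column.
chain = dc.read_parquet("s3://mybucket/files/*.parquet", column="data")
```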
datachain/lib/dc/records.py
CHANGED
@@ -1,8 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union
 
 import sqlalchemy
 
@@ -12,6 +9,7 @@ from datachain.lib.file import (
 )
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.schema import Column
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -22,7 +20,7 @@ if TYPE_CHECKING:
 
 
 def read_records(
-    to_insert: Optional[Union[dict, list[dict]]],
+    to_insert: Optional[Union[dict, Iterable[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
@@ -54,10 +52,11 @@ def read_records(
 
     if schema:
         signal_schema = SignalSchema(schema)
-        columns = [
-            sqlalchemy.Column(c.name, c.type)
-            for c in signal_schema.db_signals(as_columns=True)
-        ]
+        columns = []
+        for c in signal_schema.db_signals(as_columns=True):
+            assert isinstance(c, Column)
+            kw = {"nullable": c.nullable} if c.nullable is not None else {}
+            columns.append(sqlalchemy.Column(c.name, c.type, **kw))
     else:
         columns = [
             sqlalchemy.Column(name, typ)
@@ -83,8 +82,7 @@
 
     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-    db = warehouse.db
-    insert_q = dr.get_table().insert()
-    for record in to_insert:
-        db.execute(insert_q.values(**record))
+    table = dr.get_table()
+    warehouse.insert_rows(table, to_insert)
+    warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.name, session=session, settings=settings)
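Two behavioral changes are visible here: `to_insert` now accepts any iterable of dicts (a generator, not just a dict or list), and rows go through the warehouse bulk-insert path, with column nullability taken from the schema's `Column` objects. A hedged sketch, assuming `schema` accepts a plain name-to-type mapping as `SignalSchema` does:

```py
import datachain as dc

# A generator of records is now acceptable.
records = ({"name": f"file_{i}.jpg"} for i in range(3))
chain = dc.read_records(records, schema={"name": str})
```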
datachain/lib/dc/storage.py
CHANGED
@@ -29,7 +29,7 @@ def read_storage(
     settings: Optional[dict] = None,
     in_memory: bool = False,
     recursive: Optional[bool] = True,
-    object_name: str = "file",
+    column: str = "file",
     update: bool = False,
     anon: bool = False,
     client_config: Optional[dict] = None,
@@ -43,7 +43,7 @@ def read_storage(
         as `s3://`, `gs://`, `az://` or "file:///"
     type : read file as "binary", "text", or "image" data. Default is "binary".
     recursive : search recursively for the given path.
-    object_name : Created object column name.
+    column : Created column name.
     update : force storage reindexing. Default is False.
     anon : If True, we will treat cloud bucket as public one
     client_config : Optional client configuration for the storage client.
@@ -124,7 +124,7 @@
 
     dc = read_dataset(list_ds_name, session=session, settings=settings)
     dc._query.update = update
-    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+    dc.signals_schema = dc.signals_schema.mutate({f"{column}": file_type})
 
     if update or not list_ds_exists:
 
@@ -140,7 +140,7 @@
             .settings(prefetch=0)
             .gen(
                 list_bucket(lst_uri, cache, client_config=client_config),
-                output={f"{object_name}": file_type},
+                output={f"{column}": file_type},
             )
             .save(ds_name, listing=True)
         )
@@ -149,7 +149,7 @@
             lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
         )
 
-        chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+        chain = ls(dc, list_path, recursive=recursive, column=column)
 
         storage_chain = storage_chain.union(chain) if storage_chain else chain
         listed_ds_name.add(list_ds_name)
@@ -162,7 +162,7 @@
             file=file_values,
         )
         file_chain.signals_schema = file_chain.signals_schema.mutate(
-            {f"{object_name}": file_type}
+            {f"{column}": file_type}
        )
         storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
 
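As before, the listing signals land under one column, now named by `column` ("file" by default), so nested signal references like "file.path" keep working. A sketch (assumes the top-level `Column` export):

```py
import datachain as dc

chain = dc.read_storage("gs://datachain-demo/dogs-and-cats/", column="file", anon=True)
chain = chain.filter(dc.Column("file.path").glob("*.jpg"))
```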
datachain/lib/dc/values.py
CHANGED
@@ -24,7 +24,7 @@ def read_values(
     settings: Optional[dict] = None,
     in_memory: bool = False,
     output: OutputType = None,
-    object_name: str = "",
+    column: str = "",
     **fr_map,
 ) -> "DataChain":
     """Generate chain from list of values.
@@ -48,6 +48,6 @@
         settings=settings,
         in_memory=in_memory,
     )
-    if object_name:
-        output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
+    if column:
+        output = {column: dict_to_data_model(column, output)}  # type: ignore[arg-type]
     return chain.gen(_func_fr, output=output)
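Sketch of the two shapes `read_values` can produce after the rename:

```py
import datachain as dc

# Each keyword becomes its own top-level signal.
plain = dc.read_values(num=[1, 2, 3])

# With column set, values are grouped under one generated model.
nested = dc.read_values(num=[1, 2, 3], column="data")
```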
datachain/lib/listing.py
CHANGED
@@ -72,7 +72,7 @@ def ls(
     dc: D,
     path: str,
     recursive: Optional[bool] = True,
-    object_name="file",
+    column="file",
 ) -> D:
     """
     Return files by some path from DataChain instance which contains bucket listing.
@@ -82,7 +82,7 @@ def ls(
     """
 
     def _file_c(name: str) -> Column:
-        return Column(f"{object_name}.{name}")
+        return Column(f"{column}.{name}")
 
     dc = dc.filter(_file_c("is_latest") == true())
 
datachain/lib/signal_schema.py
CHANGED
@@ -87,6 +87,12 @@ class SignalResolvingTypeError(SignalResolvingError):
         )
 
 
+class SignalRemoveError(SignalSchemaError):
+    def __init__(self, path: Optional[list[str]], msg: str):
+        name = " '" + ".".join(path) + "'" if path else ""
+        super().__init__(f"cannot remove signal name{name}: {msg}")
+
+
 class CustomType(BaseModel):
     schema_version: int = Field(ge=1, le=2, strict=True)
     name: str
@@ -575,7 +581,11 @@
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+            else Column(
+                DEFAULT_DELIMITER.join(path),
+                python_to_sql(_type),
+                nullable=is_optional(_type),
+            )
             for path, _type, has_subtree, _ in self.get_flat_tree(
                 include_hidden=include_hidden
             )
@@ -620,18 +630,27 @@
         return curr_type
 
     def select_except_signals(self, *args: str) -> "SignalSchema":
-        schema = copy.deepcopy(self.values)
-        for field in args:
-            if not isinstance(field, str):
-                raise SignalResolvingTypeError("select_except()", field)
+        def has_signal(signal: str):
+            signal = signal.replace(".", DEFAULT_DELIMITER)
+            return any(signal == s for s in self.db_signals())
 
-            if field not in self.values:
+        schema = copy.deepcopy(self.values)
+        for signal in args:
+            if not isinstance(signal, str):
+                raise SignalResolvingTypeError("select_except()", signal)
+
+            if signal not in self.values:
+                if has_signal(signal):
+                    raise SignalRemoveError(
+                        signal.split("."),
+                        "select_except() error - removing nested signal would"
+                        " break parent schema, which isn't supported.",
+                    )
                 raise SignalResolvingError(
-                    field.split("."),
-                    "select_except() error - the feature name does not exist or "
-                    "inside of feature (not supported)",
+                    signal.split("."),
+                    "select_except() error - the signal does not exist",
                 )
-            del schema[field]
+            del schema[signal]
 
         return SignalSchema(schema)
 
@@ -975,3 +994,8 @@
         }
 
         return SignalSchema.deserialize(schema)
+
+
+def is_optional(type_: Any) -> bool:
+    """Check if a type is Optional."""
+    return get_origin(type_) is Union and type(None) in get_args(type_)
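The new `is_optional` helper is what feeds `Column.nullable` in `db_signals`, and `SignalRemoveError` now distinguishes "nested, so not removable" from "does not exist". A small sketch, assuming `SignalSchema` accepts a plain name-to-type mapping:

```py
from typing import Optional

from datachain.lib.signal_schema import SignalSchema, is_optional

assert is_optional(Optional[int])  # Optional[...] columns become nullable
assert not is_optional(int)

schema = SignalSchema({"name": str, "size": int})
reduced = schema.select_except_signals("size")  # top-level signals only;
# removing a nested signal like "file.path" raises SignalRemoveError
```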
datachain/listing.py
CHANGED
@@ -27,14 +27,14 @@ class Listing:
         client: "Client",
         dataset_name: Optional["str"] = None,
         dataset_version: Optional[int] = None,
-        object_name: str = "file",
+        column: str = "file",
     ):
         self.metastore = metastore
         self.warehouse = warehouse
         self.client = client
         self.dataset_name = dataset_name  # dataset representing bucket listing
         self.dataset_version = dataset_version  # dataset representing bucket listing
-        self.object_name = object_name
+        self.column = column
 
     def clone(self) -> "Listing":
         return self.__class__(
@@ -43,7 +43,7 @@ class Listing:
             self.client,
             self.dataset_name,
             self.dataset_version,
-            self.object_name,
+            self.column,
         )
 
     def __enter__(self) -> "Listing":
@@ -74,7 +74,7 @@ class Listing:
         return self.warehouse.dataset_rows(
             dataset,
             self.dataset_version or dataset.latest_version,
-            object_name=self.object_name,
+            column=self.column,
         )
 
     def expand_path(self, path, use_glob=True) -> list[Node]:
datachain/query/dataset.py
CHANGED
@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
                 "distributed processing."
             )
 
-            from datachain.catalog.loader import get_udf_distributor_class
+            from datachain.catalog.loader import (
+                DISTRIBUTED_IMPORT_PATH,
+                get_udf_distributor_class,
+            )
+
+            if not (udf_distributor_class := get_udf_distributor_class()):
+                raise RuntimeError(
+                    f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                    "for distributed UDF processing."
+                )
 
-            udf_distributor_class = get_udf_distributor_class()
             udf_distributor = udf_distributor_class(
                 catalog=catalog,
                 table=udf_table,
@@ -1162,16 +1170,6 @@
         )
         return sqlalchemy.table(table_name)
 
-    @staticmethod
-    def delete(
-        name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
-    ) -> None:
-        from datachain.catalog import get_catalog
-
-        catalog = catalog or get_catalog()
-        version = version or catalog.get_dataset(name).latest_version
-        catalog.remove_dataset(name, version)
-
     @property
     def attached(self) -> bool:
         """
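With `DatasetQuery.delete` gone, deletion goes through the new top-level helper from datasets.py, and the distributed path now fails fast with an explicit error when no UDF distributor is configured. Migration sketch:

```py
import datachain as dc

# Previously: DatasetQuery.delete("my_dataset", version=1)
dc.delete_dataset("my_dataset", version=1)
```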
datachain/query/dispatch.py
CHANGED
@@ -13,7 +13,7 @@ from multiprocess import get_context
 
 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
-from datachain.catalog.loader import get_udf_distributor_class
+from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
 from datachain.lib.udf import _get_cache
 from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (
@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:
 
 
 def udf_worker_entrypoint() -> int:
-    return get_udf_distributor_class().run_worker()
+    if not (udf_distributor_class := get_udf_distributor_class()):
+        raise RuntimeError(
+            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+            "for distributed UDF processing."
+        )
+    return udf_distributor_class.run_worker()
 
 
 class UDFDispatcher:
datachain/query/schema.py
CHANGED
@@ -40,12 +40,15 @@ class ColumnMeta(type):
 class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True
 
-    def __init__(self, text, type_=None, is_literal=False, _selectable=None):
+    def __init__(
+        self, text, type_=None, is_literal=False, nullable=None, _selectable=None
+    ):
         """Dataset column."""
         self.name = ColumnMeta.to_db_name(text)
         super().__init__(
             self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
         )
+        self.nullable = nullable
 
     def __getattr__(self, name: str):
         return Column(self.name + DEFAULT_DELIMITER + name)
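`Column` now carries an optional nullability flag alongside name and type, which `read_records` above uses when rebuilding `sqlalchemy.Column` objects. A minimal sketch:

```py
from sqlalchemy import Integer

from datachain.query.schema import Column

col = Column("size", Integer(), nullable=True)
assert col.nullable is True  # consumed by read_records when creating tables
```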
{datachain-0.14.4.dist-info → datachain-0.15.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.4
+Version: 0.15.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -171,7 +171,7 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", column="meta", anon=True)
 images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
@@ -213,7 +213,7 @@ Python code:
         return result.lower().startswith("success")
 
 chain = (
-    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", column="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
    .save("mistral_files")