datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/convert/sql_to_python.py
CHANGED
@@ -9,6 +9,14 @@ def sql_to_python(sql_exp: ColumnElement) -> Any:
         type_ = sql_exp.type.python_type
         if type_ == Decimal:
             type_ = float
+        elif type_ is list:
+            if hasattr(sql_exp.type, "item_type") and hasattr(
+                sql_exp.type.item_type, "python_type"
+            ):
+                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
+                type_ = list[item_type]  # type: ignore[valid-type]
+            else:
+                type_ = list
     except NotImplementedError:
         type_ = str
     return type_
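For reference, the new `list` branch can be exercised in isolation. The sketch below is a minimal reproduction, not library code: the `Fake*` classes are hypothetical stand-ins for a SQLAlchemy column expression, since the function only touches `python_type` and `item_type.python_type`.

from decimal import Decimal
from typing import Any

# Hypothetical stand-ins for a SQLAlchemy column type; the function above
# only relies on `python_type` and `item_type.python_type`.
class FakeItemType:
    python_type = int

class FakeArrayType:
    python_type = list
    item_type = FakeItemType()

class FakeColumn:
    type = FakeArrayType()

def sql_to_python(sql_exp) -> Any:
    try:
        type_ = sql_exp.type.python_type
        if type_ == Decimal:
            type_ = float
        elif type_ is list:
            if hasattr(sql_exp.type, "item_type") and hasattr(
                sql_exp.type.item_type, "python_type"
            ):
                item_type = getattr(sql_exp.type.item_type, "python_type", Any)
                type_ = list[item_type]
            else:
                type_ = list
    except NotImplementedError:
        type_ = str
    return type_

# Array columns now map to a parameterized list instead of a bare `list`.
assert sql_to_python(FakeColumn()) == list[int]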
datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -1,62 +1,177 @@
+import itertools
 from collections.abc import Sequence
-from typing import Any
-
-from datachain.lib.data_model import (
-    DataType,
-    DataTypeNames,
-    DataValue,
-    is_chain_type,
-)
+from typing import Any
+
+from datachain.lib.data_model import DataType, DataTypeNames, DataValue, is_chain_type
 from datachain.lib.utils import DataChainParamsError


 class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name: str, msg: str):
+        self.ds_name = ds_name
+        self.msg = msg
+
         if ds_name:
             ds_name = f"' {ds_name}'"
+
         super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")

+    def __reduce__(self):
+        return ValuesToTupleError, (self.ds_name, self.msg)

-def values_to_tuples(  # noqa: C901, PLR0912
-    ds_name: str = "",
-    output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
-    **fr_map: Sequence[DataValue],
-) -> tuple[Any, Any, Any]:
-    if output:
-        if not isinstance(output, (Sequence, str, dict)):
-            if len(fr_map) != 1:
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"only one output type was specified, {len(fr_map)} expected",
-                )
-            if not isinstance(output, type):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"output must specify a type while '{output}' was given",
-                )

-
-
+def _find_first_non_none(sequence: Sequence[Any]) -> Any | None:
+    """Find the first non-None element in a sequence."""
+    try:
+        return next(itertools.dropwhile(lambda i: i is None, sequence))
+    except StopIteration:
+        return None
+
+
+def _infer_list_item_type(lst: list) -> type:
+    """Infer the item type of a list, handling None values and nested lists."""
+    if len(lst) == 0:
+        # Default to str when list is empty to avoid generic list
+        return str
+
+    first_item = _find_first_non_none(lst)
+    if first_item is None:
+        # Default to str when all items are None
+        return str

-
+    item_type = type(first_item)
+
+    # Handle nested lists one level deep
+    if isinstance(first_item, list) and len(first_item) > 0:
+        nested_item = _find_first_non_none(first_item)
+        if nested_item is not None:
+            return list[type(nested_item)]  # type: ignore[misc, return-value]
+        # Default to str for nested lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return item_type
+
+
+def _infer_dict_value_type(dct: dict) -> type:
+    """Infer the value type of a dict, handling None values and list values."""
+    if len(dct) == 0:
+        # Default to str when dict is empty to avoid generic dict values
+        return str
+
+    # Find first non-None value
+    first_value = None
+    for val in dct.values():
+        if val is not None:
+            first_value = val
+            break
+
+    if first_value is None:
+        # Default to str when all values are None
+        return str
+
+    # Handle list values
+    if isinstance(first_value, list) and len(first_value) > 0:
+        list_item = _find_first_non_none(first_value)
+        if list_item is not None:
+            return list[type(list_item)]  # type: ignore[misc, return-value]
+        # Default to str for lists with all None
+        return list[str]  # type: ignore[return-value]
+
+    return type(first_value)
+
+
+def _infer_type_from_sequence(
+    sequence: Sequence[DataValue], signal_name: str, ds_name: str
+) -> type:
+    """
+    Infer the type from a sequence of values.
+
+    Returns str if all values are None, otherwise infers from the first non-None value.
+    Handles lists and dicts with proper type inference for nested structures.
+    """
+    first_element = _find_first_non_none(sequence)
+
+    if first_element is None:
+        # Default to str if column is empty or all values are None
+        return str
+
+    typ = type(first_element)
+
+    if not is_chain_type(typ):
+        raise ValuesToTupleError(
+            ds_name,
+            f"signal '{signal_name}' has unsupported type '{typ.__name__}'."
+            f" Please use DataModel types: {DataTypeNames}",
+        )
+
+    if isinstance(first_element, list):
+        item_type = _infer_list_item_type(first_element)
+        return list[item_type]  # type: ignore[valid-type, return-value]
+
+    if isinstance(first_element, dict):
+        # If the first dict is empty, use str as default key/value types
+        if len(first_element) == 0:
+            return dict[str, str]  # type: ignore[return-value]
+        first_key = next(iter(first_element.keys()))
+        value_type = _infer_dict_value_type(first_element)
+        return dict[type(first_key), value_type]  # type: ignore[misc, return-value]
+
+    return typ
+
+
+def _validate_and_normalize_output(
+    output: DataType | Sequence[str] | dict[str, DataType] | None,
+    fr_map: dict[str, Sequence[DataValue]],
+    ds_name: str,
+) -> dict[str, DataType] | None:
+    """Validate and normalize the output parameter to a dict format."""
+    if not output:
+        return None
+
+    if not isinstance(output, (Sequence, str, dict)):
+        if len(fr_map) != 1:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, DataType] while "
-                f"'{type(output).__name__}' is given",
+                f"only one output type was specified, {len(fr_map)} expected",
             )
-
-        if len(output) != len(fr_map):
+        if not isinstance(output, type):
             raise ValuesToTupleError(
                 ds_name,
-                f"number of outputs '{len(output)}' should match"
-                f" number of signals '{len(fr_map)}'",
+                f"output must specify a type while '{output}' was given",
             )

+        key: str = next(iter(fr_map.keys()))
+        return {key: output}  # type: ignore[dict-item]
+
+    if not isinstance(output, dict):
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while "
+            f"'{type(output).__name__}' is given",
+        )
+
+    if len(output) != len(fr_map):
+        raise ValuesToTupleError(
+            ds_name,
+            f"number of outputs '{len(output)}' should match"
+            f" number of signals '{len(fr_map)}'",
+        )
+
+    return output  # type: ignore[return-value]
+
+
+def values_to_tuples(
+    ds_name: str = "",
+    output: DataType | Sequence[str] | dict[str, DataType] | None = None,
+    **fr_map: Sequence[DataValue],
+) -> tuple[Any, Any, Any]:
+    output = _validate_and_normalize_output(output, fr_map, ds_name)
+
     types_map: dict[str, type] = {}
     length = -1
     for k, v in fr_map.items():
         if not isinstance(v, Sequence) or isinstance(v, str):  # type: ignore[unreachable]
-            raise ValuesToTupleError(ds_name, f"
+            raise ValuesToTupleError(ds_name, f"signal '{k}' is not a sequence")
         len_ = len(v)

         if output:
@@ -66,21 +181,11 @@ def values_to_tuples(  # noqa: C901, PLR0912
                     f"signal '{k}' is not present in the output",
                 )
         else:
-
-
-
-
-
-            if not is_chain_type(typ):
-                raise ValuesToTupleError(
-                    ds_name,
-                    f"signal '{k}' has unsupported type '{typ.__name__}'."
-                    f" Please use DataModel types: {DataTypeNames}",
-                )
-            if isinstance(first_element, list):
-                types_map[k] = list[type(first_element[0])]  # type: ignore[assignment, misc]
-            else:
-                types_map[k] = typ
+            # FIXME: Stops as soon as it finds the first non-None value.
+            # If a non-None value appears early, it won't check the remaining items for
+            # `None` values.
+            typ = _infer_type_from_sequence(v, k, ds_name)
+            types_map[k] = typ

         if length < 0:
             length = len_
@@ -104,7 +209,7 @@ def values_to_tuples(  # noqa: C901, PLR0912
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
         res_type = tuple[tuple_type]  # type: ignore[valid-type]
-        res_values: Sequence[Any] = list(zip(*fr_map.values()))
+        res_values: Sequence[Any] = list(zip(*fr_map.values(), strict=False))
     else:
         res_type = output_types[0]  # type: ignore[misc]
         res_values = next(iter(fr_map.values()))
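The net effect of the new helpers is easiest to see outside the library. Below is a condensed sketch of the inference rules (one level of nesting, and, like the helpers above, it only looks at the first non-None value); `infer` and `find_first_non_none` are illustrative names, not datachain APIs.

import itertools
from collections.abc import Sequence
from typing import Any


def find_first_non_none(sequence: Sequence[Any]) -> Any | None:
    try:
        return next(itertools.dropwhile(lambda i: i is None, sequence))
    except StopIteration:
        return None


def infer(sequence: Sequence[Any]) -> type:
    first = find_first_non_none(sequence)
    if first is None:
        return str  # empty column, or every value is None
    if isinstance(first, list):
        item = find_first_non_none(first)
        return list[type(item)] if item is not None else list[str]
    if isinstance(first, dict):
        if not first:
            return dict[str, str]
        key_type = type(next(iter(first)))
        value = find_first_non_none(list(first.values()))
        return dict[key_type, type(value) if value is not None else str]
    return type(first)


assert infer([None, None]) is str
assert infer([[1, 2], None]) == list[int]
assert infer([{"a": 1.5}]) == dict[str, float]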
datachain/lib/data_model.py
CHANGED
@@ -1,25 +1,29 @@
+import inspect
+import types
+import uuid
 from collections.abc import Sequence
 from datetime import datetime
-from typing import ClassVar, Optional, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin

 from pydantic import AliasChoices, BaseModel, Field, create_model
+from pydantic.fields import FieldInfo

 from datachain.lib.model_store import ModelStore
 from datachain.lib.utils import normalize_col_names

-StandardType = Union[
-    type[int],
-    type[str],
-    type[float],
-    type[bool],
-    type[list],
-    type[dict],
-    type[bytes],
-    type[datetime],
-]
-DataType = Union[type[BaseModel], StandardType]
+StandardType = (
+    type[int]
+    | type[str]
+    | type[float]
+    | type[bool]
+    | type[list]
+    | type[dict]
+    | type[bytes]
+    | type[datetime]
+)
+DataType = type[BaseModel] | StandardType
 DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"
-DataValue = Union[BaseModel, int, str, float, bool, list, dict, bytes, datetime]
+DataValue = BaseModel | int | str | float | bool | list | dict | bytes | datetime


 class DataModel(BaseModel):
@@ -34,7 +38,7 @@ class DataModel(BaseModel):
         ModelStore.register(cls)

     @staticmethod
-    def register(models: Union[DataType, Sequence[DataType]]):
+    def register(models: DataType | Sequence[DataType]):
         """For registering classes manually. It accepts a single class or a sequence of
         classes."""
         if not isinstance(models, Sequence):
@@ -60,8 +64,11 @@ def is_chain_type(t: type) -> bool:
     if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])

-    if orig is Union and len(args) == 2 and (type(None) in args):
-        return is_chain_type(args[0])
+    if orig is dict and len(args) == 2:
+        return is_chain_type(args[0]) and is_chain_type(args[1])
+
+    if orig in (Union, types.UnionType) and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0] if args[1] is type(None) else args[1])

     return False

@@ -69,17 +76,19 @@ def is_chain_type(t: type) -> bool:
 def dict_to_data_model(
     name: str,
     data_dict: dict[str, DataType],
-    original_names: Optional[list[str]] = None,
+    original_names: list[str] | None = None,
 ) -> type[BaseModel]:
     if not original_names:
         # Gets a map of a normalized_name -> original_name
         columns = normalize_col_names(list(data_dict))
-        data_dict = dict(zip(columns.keys(), data_dict.values()))
+        data_dict = dict(zip(columns.keys(), data_dict.values(), strict=False))
         original_names = list(columns.values())

     fields = {
         name: (
-            anno,
+            anno
+            if inspect.isclass(anno) and issubclass(anno, BaseModel)
+            else anno | None,
             Field(
                 validation_alias=AliasChoices(name, original_names[idx] or name),
                 default=None,
@@ -89,7 +98,20 @@ def dict_to_data_model(
     }

     class _DataModelStrict(BaseModel, extra="forbid"):
-        pass
+        @classmethod
+        def _model_fields_by_aliases(cls) -> dict[str, tuple[str, FieldInfo]]:
+            """Returns a map of aliases to original field names and info."""
+            field_info = {}
+            for _name, field in cls.model_fields.items():
+                assert isinstance(field.validation_alias, AliasChoices)
+                # Add mapping for all aliases (both normalized and original names)
+                for alias in field.validation_alias.choices:
+                    field_info[str(alias)] = (_name, field)
+            return field_info
+
+    # Generate random unique name if not provided
+    if not name:
+        name = f"DataModel_{uuid.uuid4().hex[:8]}"

     return create_model(
         name,
datachain/lib/dataset_info.py
CHANGED
@@ -1,17 +1,19 @@
-import json
 from datetime import datetime
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any
 from uuid import uuid4

 from pydantic import Field, field_validator

+from datachain import json
 from datachain.dataset import (
+    DEFAULT_DATASET_VERSION,
     DatasetListRecord,
     DatasetListVersion,
     DatasetStatus,
 )
 from datachain.job import Job
 from datachain.lib.data_model import DataModel
+from datachain.query.session import Session
 from datachain.utils import TIME_ZERO

 if TYPE_CHECKING:
@@ -20,21 +22,44 @@ if TYPE_CHECKING:

 class DatasetInfo(DataModel):
     name: str
+    namespace: str
+    project: str
     uuid: str = Field(default=str(uuid4()))
-    version: int = Field(default=1)
+    version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
-    finished_at: Optional[datetime] = Field(default=None)
-    num_objects: Optional[int] = Field(default=None)
-    size: Optional[int] = Field(default=None)
+    finished_at: datetime | None = Field(default=None)
+    num_objects: int | None = Field(default=None)
+    size: int | None = Field(default=None)
     params: dict[str, str] = Field(default={})
     metrics: dict[str, Any] = Field(default={})
     error_message: str = Field(default="")
     error_stack: str = Field(default="")
+    attrs: list[str] = Field(default=[])
+
+    @property
+    def is_temp(self) -> bool:
+        return Session.is_temp_dataset(self.name)
+
+    def has_attr(self, attr: str) -> bool:
+        s = attr.split("=")
+        if len(s) == 1:
+            return attr in self.attrs
+
+        name = s[0]
+        value = s[1]
+        for a in self.attrs:
+            s = a.split("=")
+            if value == "*" and s[0] == name:
+                return True
+            if len(s) == 2 and s[0] == name and s[1] == value:
+                return True
+
+        return False

     @staticmethod
     def _validate_dict(
-        v: Optional[Union[str, dict]],
+        v: str | dict | None,
     ) -> dict:
         if v is None or v == "":
             return {}
@@ -63,11 +88,13 @@ class DatasetInfo(DataModel):
         cls,
         dataset: DatasetListRecord,
         version: DatasetListVersion,
-        job: Optional[Job],
+        job: Job | None,
     ) -> "Self":
         return cls(
             uuid=version.uuid,
             name=dataset.name,
+            namespace=dataset.project.namespace.name,
+            project=dataset.project.name,
             version=version.version,
             status=version.status,
             created_at=version.created_at,
@@ -78,4 +105,5 @@ class DatasetInfo(DataModel):
             metrics=job.metrics if job else {},
             error_message=version.error_message,
             error_stack=version.error_stack,
+            attrs=dataset.attrs,
         )
datachain/lib/dc/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .csv import read_csv
+from .database import read_database
 from .datachain import C, Column, DataChain
-from .datasets import datasets, read_dataset
+from .datasets import datasets, delete_dataset, move_dataset, read_dataset
 from .hf import read_hf
 from .json import read_json
 from .listings import listings
@@ -8,7 +9,7 @@ from .pandas import read_pandas
 from .parquet import read_parquet
 from .records import read_records
 from .storage import read_storage
-from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .utils import DatasetMergeError, DatasetPrepareError, Sys, is_local, is_studio
 from .values import read_values

 __all__ = [
@@ -19,8 +20,13 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
+    "delete_dataset",
+    "is_local",
+    "is_studio",
     "listings",
+    "move_dataset",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
datachain/lib/dc/csv.py
CHANGED
@@ -1,10 +1,6 @@
-
-from typing import (
-    TYPE_CHECKING,
-    Callable,
-    Optional,
-    Union,
-)
+import os
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING

 from datachain.lib.dc.utils import DatasetPrepareError, OutputType
 from datachain.lib.model_store import ModelStore
@@ -17,38 +13,38 @@ if TYPE_CHECKING:


 def read_csv(
-    path,
-    delimiter: Optional[str] = None,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
+    delimiter: str | None = None,
     header: bool = True,
     output: OutputType = None,
-    object_name: str = "",
+    column: str = "",
     model_name: str = "",
     source: bool = True,
-    nrows=None,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
-    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
-    parse_options: Optional[dict[str, "Union[str, bool, Callable]"]] = None,
+    nrows: int | None = None,
+    session: Session | None = None,
+    settings: dict | None = None,
+    column_types: dict[str, "str | ArrowDataType"] | None = None,
+    parse_options: dict[str, str | bool | Callable] | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from csv files.

     Parameters:
-        path : Storage URI with directory. URI must start with storage prefix such
+        path: Storage URI with directory. URI must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///".
-        delimiter : Character for delimiting columns. Takes precedence if also
+        delimiter: Character for delimiting columns. Takes precedence if also
             specified in `parse_options`. Defaults to ",".
-        header : Whether the files include a header row.
-        output : Dictionary or feature class defining column names and their
+        header: Whether the files include a header row.
+        output: Dictionary or feature class defining column names and their
             corresponding types. List of column names is also accepted, in which
             case types will be inferred.
-        object_name : Created object name.
-        model_name : Generated model name.
-        source : Whether to include info about the source file.
-        nrows : Optional row limit.
-        session : Session to use for the chain.
-        settings : Settings to use for the chain.
-        column_types : Dictionary of column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        nrows: Optional row limit.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column_types: Dictionary of column names and their corresponding types.
             It is passed to CSV reader and for each column specified type auto
             inference is disabled.
         parse_options: Tells the parser how to process lines.
@@ -67,7 +63,7 @@ def read_csv(
         chain = dc.read_csv("s3://mybucket/dir")
         ```
     """
-    from pandas.io.parsers.readers import STR_NA_VALUES
+    from pandas._libs.parsers import STR_NA_VALUES
     from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias
@@ -119,9 +115,10 @@ def read_csv(
     )
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         nrows=nrows,
         format=format,
+        parse_options=parse_options,
     )