datachain 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- datachain/__init__.py +28 -1
- datachain/catalog/catalog.py +18 -9
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/commands/show.py +2 -3
- datachain/diff/__init__.py +8 -5
- datachain/lib/dc/__init__.py +32 -0
- datachain/lib/dc/csv.py +127 -0
- datachain/lib/{dc.py → dc/datachain.py} +144 -733
- datachain/lib/dc/datasets.py +149 -0
- datachain/lib/dc/hf.py +73 -0
- datachain/lib/dc/json.py +91 -0
- datachain/lib/dc/listings.py +43 -0
- datachain/lib/dc/pandas.py +56 -0
- datachain/lib/dc/parquet.py +65 -0
- datachain/lib/dc/records.py +90 -0
- datachain/lib/dc/storage.py +118 -0
- datachain/lib/dc/utils.py +128 -0
- datachain/lib/dc/values.py +53 -0
- datachain/lib/meta_formats.py +2 -4
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +3 -3
- datachain/toolkit/split.py +2 -2
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/METADATA +12 -11
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/RECORD +28 -16
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/WHEEL +1 -1
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info/licenses}/LICENSE +0 -0
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED

@@ -1,5 +1,21 @@
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
-from datachain.lib.dc import
+from datachain.lib.dc import (
+    C,
+    Column,
+    DataChain,
+    Sys,
+    datasets,
+    from_csv,
+    from_dataset,
+    from_hf,
+    from_json,
+    from_pandas,
+    from_parquet,
+    from_records,
+    from_storage,
+    from_values,
+    listings,
+)
 from datachain.lib.file import (
     ArrowRow,
     File,
@@ -44,7 +60,18 @@ __all__ = [
     "VideoFile",
     "VideoFragment",
     "VideoFrame",
+    "datasets",
+    "from_csv",
+    "from_dataset",
+    "from_hf",
+    "from_json",
+    "from_pandas",
+    "from_parquet",
+    "from_records",
+    "from_storage",
+    "from_values",
     "is_chain_type",
+    "listings",
     "metrics",
     "param",
 ]
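The change above appears to expose the new module-level constructors directly from the package root, alongside the existing classes. A minimal sketch of the resulting call style; the column name and bucket path are illustrative, not taken from this diff:

```py
import datachain as dc

# build a chain from in-memory values
numbers = dc.from_values(num=[1, 2, 3])

# build a chain by listing files under a storage prefix
files = dc.from_storage("s3://mybucket/dir/")
```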
datachain/catalog/catalog.py
CHANGED

@@ -583,10 +583,10 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain
+        from datachain import from_storage
         from datachain.listing import Listing
 
-
+        from_storage(
             source, session=self.session, update=update, object_name=object_name
         )
 
@@ -795,6 +795,19 @@ class Catalog:
         try:
             dataset = self.get_dataset(name)
             default_version = dataset.next_version
+
+            if (description or labels) and (
+                dataset.description != description or dataset.labels != labels
+            ):
+                description = description or dataset.description
+                labels = labels or dataset.labels
+
+                self.update_dataset(
+                    dataset,
+                    description=description,
+                    labels=labels,
+                )
+
         except DatasetNotFoundError:
             schema = {
                 c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
@@ -981,18 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain
+        from datachain import from_dataset, from_storage
 
         chains = []
        for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc =
-                    source[len(DATASET_PREFIX) :], session=self.session
-                )
+                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc =
-                    source, session=self.session, recursive=recursive
-                )
+                dc = from_storage(source, session=self.session, recursive=recursive)
 
             chains.append(dc)
 
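For clarity, a standalone restatement of the dataset-metadata rule introduced above; the helper and argument names are illustrative, not part of the datachain API:

```py
def resolve_dataset_meta(dataset, description, labels):
    # Same rule as the hunk above: only update when new metadata is passed and
    # differs from what is stored; explicit values win, missing ones fall back.
    if (description or labels) and (
        dataset.description != description or dataset.labels != labels
    ):
        return description or dataset.description, labels or dataset.labels
    return dataset.description, dataset.labels
```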
|
datachain/cli/commands/ls.py
CHANGED

@@ -36,7 +36,7 @@ def ls_local(
     client_config=None,
     **kwargs,
 ):
-    from datachain import
+    from datachain import listings
 
     if sources:
         if catalog is None:
@@ -63,7 +63,7 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(
+        listing = list(listings().collect("listing"))
         for ls in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
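The CLI now goes through the new `listings()` constructor shown above. A hedged usage sketch outside the CLI; the `"listing"` signal name comes from the code above, the loop itself is illustrative:

```py
import datachain as dc

# print every known storage listing, roughly what `datachain ls` does with no sources
for ls in dc.listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")
```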
|
datachain/cli/commands/show.py
CHANGED

@@ -18,8 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session
-    from datachain.lib.dc import DataChain
+    from datachain import Session, from_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
 
@@ -52,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc =
+        dc = from_dataset(name=name, version=version, session=session)
         dc.print_schema()
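The same `from_dataset` call used by the CLI above works directly in user code; a minimal sketch, assuming a dataset named `my-dataset` with version 1 already exists (both are made up for illustration):

```py
import datachain as dc

chain = dc.from_dataset(name="my-dataset", version=1)
chain.print_schema()
```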
datachain/diff/__init__.py
CHANGED

@@ -74,6 +74,7 @@ def _compare(  # noqa: C901
     # all left and right columns
     cols = left.signals_schema.clone_without_sys_signals().db_signals()
     right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+    cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
 
     # getting correct on and right_on column names
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
@@ -131,10 +132,12 @@ def _compare(  # noqa: C901
         # when the row is deleted, we need to take column values from the right chain
         .mutate(
             **{
-                f"{
-                    C(diff_col) == CompareStatus.DELETED,
+                f"{l_on}": ifelse(
+                    C(diff_col) == CompareStatus.DELETED,
+                    C(f"{rname + l_on if on == right_on else r_on}"),
+                    C(l_on),
                 )
-                for
+                for l_on, r_on in zip(on, right_on)  # type: ignore[arg-type]
             }
         )
         .select_except(ldiff_col, rdiff_col)
@@ -150,9 +153,9 @@ def _compare(  # noqa: C901
     dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.DELETED)
 
     if status_col:
-
+        cols_select.append(diff_col)
 
-    dc_diff = dc_diff.select(*
+    dc_diff = dc_diff.select(*cols_select)
 
     # final schema is schema from the left chain with status column added if needed
     dc_diff.signals_schema = (
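The `_compare` change above uses `ifelse` to backfill key columns from the right-hand chain for deleted rows. A rough sketch of the same per-row fallback on a user chain; the import path for `ifelse`, the column names, and the `"D"` status value are assumptions, not taken from this diff:

```py
import datachain as dc
from datachain import C
from datachain.func import ifelse  # assumed import path

chain = dc.from_values(id=[1, 2], right_id=[1, 99], diff=["S", "D"])
# where the status column marks a row as deleted ("D" here is illustrative),
# take the id from the right-hand copy instead of the left one
chain = chain.mutate(new_id=ifelse(C("diff") == "D", C("right_id"), C("id")))
```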
datachain/lib/dc/__init__.py
ADDED

from .csv import from_csv
from .datachain import C, Column, DataChain
from .datasets import datasets, from_dataset
from .hf import from_hf
from .json import from_json
from .listings import listings
from .pandas import from_pandas
from .parquet import from_parquet
from .records import from_records
from .storage import from_storage
from .utils import DatasetMergeError, DatasetPrepareError, Sys
from .values import from_values

__all__ = [
    "C",
    "Column",
    "DataChain",
    "DatasetMergeError",
    "DatasetPrepareError",
    "Sys",
    "datasets",
    "from_csv",
    "from_dataset",
    "from_hf",
    "from_json",
    "from_pandas",
    "from_parquet",
    "from_records",
    "from_storage",
    "from_values",
    "listings",
]
datachain/lib/dc/csv.py
ADDED

from collections.abc import Sequence
from typing import (
    TYPE_CHECKING,
    Callable,
    Optional,
    Union,
)

from datachain.lib.dc.utils import DatasetPrepareError, OutputType
from datachain.lib.model_store import ModelStore
from datachain.query import Session

if TYPE_CHECKING:
    from pyarrow import DataType as ArrowDataType

    from .datachain import DataChain


def from_csv(
    path,
    delimiter: Optional[str] = None,
    header: bool = True,
    output: OutputType = None,
    object_name: str = "",
    model_name: str = "",
    source: bool = True,
    nrows=None,
    session: Optional[Session] = None,
    settings: Optional[dict] = None,
    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
    parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
    **kwargs,
) -> "DataChain":
    """Generate chain from csv files.

    Parameters:
        path : Storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///".
        delimiter : Character for delimiting columns. Takes precedence if also
            specified in `parse_options`. Defaults to ",".
        header : Whether the files include a header row.
        output : Dictionary or feature class defining column names and their
            corresponding types. List of column names is also accepted, in which
            case types will be inferred.
        object_name : Created object column name.
        model_name : Generated model name.
        source : Whether to include info about the source file.
        nrows : Optional row limit.
        session : Session to use for the chain.
        settings : Settings to use for the chain.
        column_types : Dictionary of column names and their corresponding types.
            It is passed to CSV reader and for each column specified type auto
            inference is disabled.
        parse_options: Tells the parser how to process lines.
            See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html

    Example:
        Reading a csv file:
        ```py
        import datachain as dc
        chain = dc.from_csv("s3://mybucket/file.csv")
        ```

        Reading csv files from a directory as a combined dataset:
        ```py
        import datachain as dc
        chain = dc.from_csv("s3://mybucket/dir")
        ```
    """
    from pandas.io.parsers.readers import STR_NA_VALUES
    from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
    from pyarrow.dataset import CsvFileFormat
    from pyarrow.lib import type_for_alias

    from .storage import from_storage

    parse_options = parse_options or {}
    if "delimiter" not in parse_options:
        parse_options["delimiter"] = ","
    if delimiter:
        parse_options["delimiter"] = delimiter

    if column_types:
        column_types = {
            name: type_for_alias(typ) if isinstance(typ, str) else typ
            for name, typ in column_types.items()
        }
    else:
        column_types = {}

    chain = from_storage(path, session=session, settings=settings, **kwargs)

    column_names = None
    if not header:
        if not output:
            msg = "error parsing csv - provide output if no header"
            raise DatasetPrepareError(chain.name, msg)
        if isinstance(output, Sequence):
            column_names = output  # type: ignore[assignment]
        elif isinstance(output, dict):
            column_names = list(output.keys())
        elif (fr := ModelStore.to_pydantic(output)) is not None:
            column_names = list(fr.model_fields.keys())
        else:
            msg = f"error parsing csv - incompatible output type {type(output)}"
            raise DatasetPrepareError(chain.name, msg)

    parse_options = ParseOptions(**parse_options)
    read_options = ReadOptions(column_names=column_names)
    convert_options = ConvertOptions(
        strings_can_be_null=True,
        null_values=STR_NA_VALUES,
        column_types=column_types,
    )
    format = CsvFileFormat(
        parse_options=parse_options,
        read_options=read_options,
        convert_options=convert_options,
    )
    return chain.parse_tabular(
        output=output,
        object_name=object_name,
        model_name=model_name,
        source=source,
        nrows=nrows,
        format=format,
    )
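Beyond the docstring examples, a hedged sketch of the headerless-CSV path handled above; the path, column names, types, and delimiter are made up for illustration:

```py
import datachain as dc

# with header=False an explicit `output` is required; a dict supplies both
# column names and types, and parse_options overrides the default "," delimiter
chain = dc.from_csv(
    "s3://mybucket/data.csv",
    header=False,
    output={"name": str, "age": int},
    parse_options={"delimiter": ";"},
)
```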