datachain 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +28 -1
- datachain/catalog/catalog.py +6 -10
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/commands/show.py +2 -3
- datachain/client/fsspec.py +3 -3
- datachain/lib/dc/__init__.py +32 -0
- datachain/lib/dc/csv.py +127 -0
- datachain/lib/{dc.py → dc/datachain.py} +144 -733
- datachain/lib/dc/datasets.py +149 -0
- datachain/lib/dc/hf.py +73 -0
- datachain/lib/dc/json.py +91 -0
- datachain/lib/dc/listings.py +43 -0
- datachain/lib/dc/pandas.py +56 -0
- datachain/lib/dc/parquet.py +65 -0
- datachain/lib/dc/records.py +90 -0
- datachain/lib/dc/storage.py +170 -0
- datachain/lib/dc/utils.py +128 -0
- datachain/lib/dc/values.py +53 -0
- datachain/lib/meta_formats.py +2 -4
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +3 -3
- datachain/query/dataset.py +39 -16
- datachain/toolkit/split.py +2 -2
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/METADATA +11 -11
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/RECORD +29 -17
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/WHEEL +1 -1
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.13.1.dist-info → datachain-0.14.1.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -1,5 +1,21 @@
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
-from datachain.lib.dc import C, Column, DataChain, Sys
+from datachain.lib.dc import (
+    C,
+    Column,
+    DataChain,
+    Sys,
+    datasets,
+    from_csv,
+    from_dataset,
+    from_hf,
+    from_json,
+    from_pandas,
+    from_parquet,
+    from_records,
+    from_storage,
+    from_values,
+    listings,
+)
 from datachain.lib.file import (
     ArrowRow,
     File,
@@ -44,7 +60,18 @@ __all__ = [
     "VideoFile",
     "VideoFragment",
     "VideoFrame",
+    "datasets",
+    "from_csv",
+    "from_dataset",
+    "from_hf",
+    "from_json",
+    "from_pandas",
+    "from_parquet",
+    "from_records",
+    "from_storage",
+    "from_values",
     "is_chain_type",
+    "listings",
     "metrics",
     "param",
 ]
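The net effect of this hunk is that the chain constructors become importable straight from the top-level package instead of as `DataChain` classmethods. A minimal sketch of the new call style (the column name and values are placeholders):

```py
import datachain as dc

# 0.14.x exposes from_values, from_storage, etc. as module-level
# functions re-exported at the package root.
chain = dc.from_values(num=[1, 2, 3])
print(list(chain.collect("num")))  # [1, 2, 3]
```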
datachain/catalog/catalog.py
CHANGED
@@ -583,12 +583,12 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain.lib.dc import DataChain
+        from datachain import from_storage
        from datachain.listing import Listing
 
-        DataChain.from_storage(
+        from_storage(
             source, session=self.session, update=update, object_name=object_name
-        )
+        ).exec()
 
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
@@ -994,18 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain.lib.dc import DataChain
+        from datachain import from_dataset, from_storage
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc = DataChain.from_dataset(
-                    source[len(DATASET_PREFIX) :], session=self.session
-                )
+                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc = DataChain.from_storage(
-                    source, session=self.session, recursive=recursive
-                )
+                dc = from_storage(source, session=self.session, recursive=recursive)
 
             chains.append(dc)
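Beyond the import swap, note that the listing call now chains `.exec()`: presumably because the module-level `from_storage` builds a lazy chain, the catalog forces evaluation explicitly for its indexing side effect. A hedged sketch of the same pattern (the bucket URI is a placeholder):

```py
import datachain as dc

# from_storage() only describes the listing; .exec() runs the chain for
# its side effect without materializing rows in the caller.
dc.from_storage("s3://mybucket/dir").exec()
```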
datachain/cli/commands/ls.py
CHANGED
@@ -36,7 +36,7 @@ def ls_local(
     client_config=None,
     **kwargs,
 ):
-    from datachain import DataChain
+    from datachain import listings
 
     if sources:
         if catalog is None:
@@ -63,7 +63,7 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(DataChain.listings().collect("listing"))
+        listing = list(listings().collect("listing"))
         for ls in listing:
             print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
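The rewritten branch maps directly onto the new free function. Roughly equivalent standalone usage, following the CLI's `uri@version` output convention:

```py
import datachain as dc

# Enumerate indexed storage listings, as `datachain ls` now does.
for ls in dc.listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")
```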
datachain/cli/commands/show.py
CHANGED
@@ -18,8 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session
-    from datachain.lib.dc import DataChain
+    from datachain import Session, from_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
@@ -52,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc = DataChain.from_dataset(name=name, version=version, session=session)
+        dc = from_dataset(name=name, version=version, session=session)
         dc.print_schema()
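The schema branch now goes through `from_dataset`. A sketch of the same flow outside the CLI (the dataset name and version are placeholders):

```py
import datachain as dc

# Load a saved dataset by name/version and print its schema, mirroring
# what `datachain show --schema` does internally.
chain = dc.from_dataset(name="my-dataset", version=1)
chain.print_schema()
```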
datachain/client/fsspec.py
CHANGED
@@ -89,9 +89,9 @@ class Client(ABC):
         from .local import FileClient
         from .s3 import ClientS3
 
-        protocol = urlparse(url).scheme
+        protocol = urlparse(os.fspath(url)).scheme
 
-        if not protocol or _is_win_local_path(url):
+        if not protocol or _is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
         source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
     ) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, _ = cls.split_url(source)
+        storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
             storage_url = storage_url.removeprefix("/")
 
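These three call sites share one fix: `urlparse`, `_is_win_local_path`, and `split_url` work on a `str`, while the public signature accepts `Union[str, os.PathLike[str]]`. `os.fspath` normalizes both cases:

```py
import os
from pathlib import Path

# os.fspath() converts a PathLike to str and passes str through
# unchanged, so downstream parsers always see a plain string.
print(os.fspath(Path("/tmp/data")))  # "/tmp/data"
print(os.fspath("s3://bucket/key"))  # "s3://bucket/key"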
datachain/lib/dc/__init__.py
ADDED
@@ -0,0 +1,32 @@
+from .csv import from_csv
+from .datachain import C, Column, DataChain
+from .datasets import datasets, from_dataset
+from .hf import from_hf
+from .json import from_json
+from .listings import listings
+from .pandas import from_pandas
+from .parquet import from_parquet
+from .records import from_records
+from .storage import from_storage
+from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .values import from_values
+
+__all__ = [
+    "C",
+    "Column",
+    "DataChain",
+    "DatasetMergeError",
+    "DatasetPrepareError",
+    "Sys",
+    "datasets",
+    "from_csv",
+    "from_dataset",
+    "from_hf",
+    "from_json",
+    "from_pandas",
+    "from_parquet",
+    "from_records",
+    "from_storage",
+    "from_values",
+    "listings",
+]
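Because the top-level `datachain/__init__.py` imports these same names from this subpackage, both import paths resolve to the same objects:

```py
from datachain import from_storage as top_level
from datachain.lib.dc import from_storage as subpackage

# The package root re-exports the subpackage's functions, so the two
# names refer to the same function object.
assert top_level is subpackage
```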
datachain/lib/dc/csv.py
ADDED
@@ -0,0 +1,127 @@
+from collections.abc import Sequence
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    Optional,
+    Union,
+)
+
+from datachain.lib.dc.utils import DatasetPrepareError, OutputType
+from datachain.lib.model_store import ModelStore
+from datachain.query import Session
+
+if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
+
+    from .datachain import DataChain
+
+
+def from_csv(
+    path,
+    delimiter: Optional[str] = None,
+    header: bool = True,
+    output: OutputType = None,
+    object_name: str = "",
+    model_name: str = "",
+    source: bool = True,
+    nrows=None,
+    session: Optional[Session] = None,
+    settings: Optional[dict] = None,
+    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+    parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
+    **kwargs,
+) -> "DataChain":
+    """Generate chain from csv files.
+
+    Parameters:
+        path : Storage URI with directory. URI must start with storage prefix such
+            as `s3://`, `gs://`, `az://` or "file:///".
+        delimiter : Character for delimiting columns. Takes precedence if also
+            specified in `parse_options`. Defaults to ",".
+        header : Whether the files include a header row.
+        output : Dictionary or feature class defining column names and their
+            corresponding types. List of column names is also accepted, in which
+            case types will be inferred.
+        object_name : Created object column name.
+        model_name : Generated model name.
+        source : Whether to include info about the source file.
+        nrows : Optional row limit.
+        session : Session to use for the chain.
+        settings : Settings to use for the chain.
+        column_types : Dictionary of column names and their corresponding types.
+            It is passed to CSV reader and for each column specified type auto
+            inference is disabled.
+        parse_options: Tells the parser how to process lines.
+            See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+
+    Example:
+        Reading a csv file:
+        ```py
+        import datachain as dc
+        chain = dc.from_csv("s3://mybucket/file.csv")
+        ```
+
+        Reading csv files from a directory as a combined dataset:
+        ```py
+        import datachain as dc
+        chain = dc.from_csv("s3://mybucket/dir")
+        ```
+    """
+    from pandas.io.parsers.readers import STR_NA_VALUES
+    from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
+    from pyarrow.dataset import CsvFileFormat
+    from pyarrow.lib import type_for_alias
+
+    from .storage import from_storage
+
+    parse_options = parse_options or {}
+    if "delimiter" not in parse_options:
+        parse_options["delimiter"] = ","
+    if delimiter:
+        parse_options["delimiter"] = delimiter
+
+    if column_types:
+        column_types = {
+            name: type_for_alias(typ) if isinstance(typ, str) else typ
+            for name, typ in column_types.items()
+        }
+    else:
+        column_types = {}
+
+    chain = from_storage(path, session=session, settings=settings, **kwargs)
+
+    column_names = None
+    if not header:
+        if not output:
+            msg = "error parsing csv - provide output if no header"
+            raise DatasetPrepareError(chain.name, msg)
+        if isinstance(output, Sequence):
+            column_names = output  # type: ignore[assignment]
+        elif isinstance(output, dict):
+            column_names = list(output.keys())
+        elif (fr := ModelStore.to_pydantic(output)) is not None:
+            column_names = list(fr.model_fields.keys())
+        else:
+            msg = f"error parsing csv - incompatible output type {type(output)}"
+            raise DatasetPrepareError(chain.name, msg)
+
+    parse_options = ParseOptions(**parse_options)
+    read_options = ReadOptions(column_names=column_names)
+    convert_options = ConvertOptions(
+        strings_can_be_null=True,
+        null_values=STR_NA_VALUES,
+        column_types=column_types,
+    )
+    format = CsvFileFormat(
+        parse_options=parse_options,
+        read_options=read_options,
+        convert_options=convert_options,
+    )
+    return chain.parse_tabular(
+        output=output,
+        object_name=object_name,
+        model_name=model_name,
+        source=source,
+        nrows=nrows,
+        format=format,
+    )
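Putting the new keyword options together, a hedged sketch (the URI and column name are placeholders):

```py
import datachain as dc

# Read tab-separated files and pin one column's Arrow type; string
# aliases like "float64" are resolved via pyarrow's type_for_alias, and
# the delimiter argument overrides any value in parse_options.
chain = dc.from_csv(
    "s3://mybucket/data.tsv",
    delimiter="\t",
    column_types={"price": "float64"},
)
```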