datachain 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,149 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.lib.dataset_info import DatasetInfo
+ from datachain.lib.file import (
+     File,
+ )
+ from datachain.lib.settings import Settings
+ from datachain.lib.signal_schema import SignalSchema
+ from datachain.query import Session
+ from datachain.query.dataset import DatasetQuery
+
+ from .utils import Sys
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_dataset(
+     name: str,
+     version: Optional[int] = None,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     fallback_to_studio: bool = True,
+ ) -> "DataChain":
+     """Get data from a saved Dataset. It returns the chain itself.
+     If dataset or version is not found locally, it will try to pull it from Studio.
+
+     Parameters:
+         name : dataset name
+         version : dataset version
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+         fallback_to_studio : Try to pull dataset from Studio if not found locally.
+             Default is True.
+
+     Example:
+         ```py
+         import datachain as dc
+         chain = dc.from_dataset("my_cats")
+         ```
+
+         ```py
+         chain = dc.from_dataset("my_cats", fallback_to_studio=False)
+         ```
+
+         ```py
+         chain = dc.from_dataset("my_cats", version=1)
+         ```
+
+         ```py
+         session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
+         settings = {
+             "cache": True,
+             "parallel": 4,
+             "workers": 4,
+             "min_task_size": 1000,
+             "prefetch": 10,
+         }
+         chain = dc.from_dataset(
+             name="my_cats",
+             version=1,
+             session=session,
+             settings=settings,
+             fallback_to_studio=True,
+         )
+         ```
+     """
+     from datachain.telemetry import telemetry
+
+     from .datachain import DataChain
+
+     query = DatasetQuery(
+         name=name,
+         version=version,
+         session=session,
+         indexing_column_types=File._datachain_column_types,
+         fallback_to_studio=fallback_to_studio,
+     )
+     telemetry.send_event_once("class", "datachain_init", name=name, version=version)
+     if settings:
+         _settings = Settings(**settings)
+     else:
+         _settings = Settings()
+
+     signals_schema = SignalSchema({"sys": Sys})
+     if query.feature_schema:
+         signals_schema |= SignalSchema.deserialize(query.feature_schema)
+     else:
+         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
+     return DataChain(query, _settings, signals_schema)
+
+
+ def datasets(
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     object_name: str = "dataset",
+     include_listing: bool = False,
+     studio: bool = False,
+ ) -> "DataChain":
+     """Generate chain with list of registered datasets.
+
+     Args:
+         session: Optional session instance. If not provided, uses default session.
+         settings: Optional dictionary of settings to configure the chain.
+         in_memory: If True, creates an in-memory session. Defaults to False.
+         object_name: Name of the output object in the chain. Defaults to "dataset".
+         include_listing: If True, includes listing datasets. Defaults to False.
+         studio: If True, returns datasets from Studio only,
+             otherwise returns all local datasets. Defaults to False.
+
+     Returns:
+         DataChain: A new DataChain instance containing dataset information.
+
+     Example:
+         ```py
+         import datachain as dc
+
+         chain = dc.datasets()
+         for ds in chain.collect("dataset"):
+             print(f"{ds.name}@v{ds.version}")
+         ```
+     """
+
+     session = Session.get(session, in_memory=in_memory)
+     catalog = session.catalog
+
+     datasets_values = [
+         DatasetInfo.from_models(d, v, j)
+         for d, v, j in catalog.list_datasets_versions(
+             include_listing=include_listing, studio=studio
+         )
+     ]
+
+     return from_values(
+         session=session,
+         settings=settings,
+         in_memory=in_memory,
+         output={object_name: DatasetInfo},
+         **{object_name: datasets_values},  # type: ignore[arg-type]
+     )
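For orientation, a minimal usage sketch of the two helpers added above; the dataset name `my_cats` is a placeholder, and the calls simply mirror the signatures and docstring examples in the diff:

```py
import datachain as dc

# List registered datasets (one DatasetInfo object per row), then load one
# of them by name; version and fallback_to_studio work as documented above.
for ds in dc.datasets().collect("dataset"):
    print(ds.name, ds.version)

chain = dc.from_dataset("my_cats", version=1, fallback_to_studio=False)
```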
datachain/lib/dc/hf.py ADDED
@@ -0,0 +1,73 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.data_model import dict_to_data_model
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from datachain.lib.data_model import DataType
+     from datachain.lib.hf import HFDatasetType
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_hf(
+     dataset: Union[str, "HFDatasetType"],
+     *args,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     object_name: str = "",
+     model_name: str = "",
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain from huggingface hub dataset.
+
+     Parameters:
+         dataset : Path or name of the dataset to read from Hugging Face Hub,
+             or an instance of `datasets.Dataset`-like object.
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+         object_name : Generated object column name.
+         model_name : Generated model name.
+         kwargs : Parameters to pass to datasets.load_dataset.
+
+     Example:
+         Load from Hugging Face Hub:
+         ```py
+         import datachain as dc
+         chain = dc.from_hf("beans", split="train")
+         ```
+
+         Generate chain from loaded dataset:
+         ```py
+         from datasets import load_dataset
+         ds = load_dataset("beans", split="train")
+         import datachain as dc
+         chain = dc.from_hf(ds)
+         ```
+     """
+     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
+
+     from .values import from_values
+
+     output: dict[str, DataType] = {}
+     ds_dict = stream_splits(dataset, *args, **kwargs)
+     if len(ds_dict) > 1:
+         output = {"split": str}
+
+     model_name = model_name or object_name or ""
+     hf_features = next(iter(ds_dict.values())).features
+     output = output | get_output_schema(hf_features)
+     model = dict_to_data_model(model_name, output)
+     if object_name:
+         output = {object_name: model}
+
+     chain = from_values(split=list(ds_dict.keys()), session=session, settings=settings)
+     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
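A small sketch of `from_hf` with the optional naming parameters; `"hf"` and `"HFRow"` are arbitrary names chosen here, and `"beans"` is the dataset used in the docstring above:

```py
import datachain as dc

# Wrap all generated columns into a single "hf" object backed by a data model
# named "HFRow"; extra keyword arguments are forwarded to datasets.load_dataset.
chain = dc.from_hf("beans", split="train", object_name="hf", model_name="HFRow")
```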
@@ -0,0 +1,91 @@
+ import os
+ import os.path
+ import re
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.data_model import DataType
+ from datachain.lib.file import (
+     File,
+     FileType,
+ )
+ from datachain.lib.meta_formats import read_meta
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_json(
+     path: Union[str, os.PathLike[str]],
+     type: FileType = "text",
+     spec: Optional[DataType] = None,
+     schema_from: Optional[str] = "auto",
+     jmespath: Optional[str] = None,
+     object_name: Optional[str] = "",
+     model_name: Optional[str] = None,
+     format: Optional[str] = "json",
+     nrows=None,
+     **kwargs,
+ ) -> "DataChain":
+     """Get data from JSON. It returns the chain itself.
+
+     Parameters:
+         path : storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///"
+         type : read file as "binary", "text", or "image" data. Default is "text".
+         spec : optional Data Model
+         schema_from : path to sample to infer spec (if schema not provided)
+         object_name : generated object column name
+         model_name : optional generated model name
+         format: "json", "jsonl"
+         jmespath : optional JMESPATH expression to reduce JSON
+         nrows : optional row limit for jsonl and JSON arrays
+
+     Example:
+         infer JSON schema from data, reduce using JMESPATH
+         ```py
+         import datachain as dc
+         chain = dc.from_json("gs://json", jmespath="key1.key2")
+         ```
+
+         infer JSON schema from a particular path
+         ```py
+         import datachain as dc
+         chain = dc.from_json("gs://json_ds", schema_from="gs://json/my.json")
+         ```
+     """
+     from .storage import from_storage
+
+     if schema_from == "auto":
+         schema_from = str(path)
+
+     def jmespath_to_name(s: str):
+         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+         return s[:name_end]
+
+     if (not object_name) and jmespath:
+         object_name = jmespath_to_name(jmespath)
+     if not object_name:
+         object_name = format
+     chain = from_storage(uri=path, type=type, **kwargs)
+     signal_dict = {
+         object_name: read_meta(
+             schema_from=schema_from,
+             format=format,
+             spec=spec,
+             model_name=model_name,
+             jmespath=jmespath,
+             nrows=nrows,
+         ),
+         "params": {"file": File},
+     }
+     # disable prefetch if nrows is set
+     settings = {"prefetch": 0} if nrows else {}
+     return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
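A short sketch combining the `format` and `nrows` parameters documented above; the `gs://json_ds/logs` path is a placeholder:

```py
import datachain as dc

# Read newline-delimited JSON and cap the number of rows; when nrows is set,
# from_json disables prefetch internally (see the settings handling above).
chain = dc.from_json("gs://json_ds/logs", format="jsonl", nrows=100)
```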
@@ -0,0 +1,43 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.lib.listing_info import ListingInfo
+ from datachain.query import Session
+
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def listings(
+     session: Optional[Session] = None,
+     in_memory: bool = False,
+     object_name: str = "listing",
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain with list of cached listings.
+     Listing is a special kind of dataset which has directory listing data of
+     some underlying storage (e.g. an S3 bucket).
+
+     Example:
+         ```py
+         import datachain as dc
+         dc.listings().show()
+         ```
+     """
+     session = Session.get(session, in_memory=in_memory)
+     catalog = kwargs.get("catalog") or session.catalog
+
+     return from_values(
+         session=session,
+         in_memory=in_memory,
+         output={object_name: ListingInfo},
+         **{object_name: catalog.listings()},  # type: ignore[arg-type]
+     )
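A one-line sketch of the new helper, using the `object_name` parameter shown above (the column name `"listing"` is simply the default spelled out):

```py
import datachain as dc

# Each row is a ListingInfo object; object_name controls the output column name.
dc.listings(object_name="listing").show()
```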
@@ -0,0 +1,56 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+ )
+
+ from datachain.query import Session
+
+ from .values import from_values
+
+ if TYPE_CHECKING:
+     import pandas as pd
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_pandas(  # type: ignore[override]
+     df: "pd.DataFrame",
+     name: str = "",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     object_name: str = "",
+ ) -> "DataChain":
+     """Generate chain from pandas data-frame.
+
+     Example:
+         ```py
+         import pandas as pd
+         import datachain as dc
+
+         df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
+         dc.from_pandas(df)
+         ```
+     """
+     from .utils import DatasetPrepareError
+
+     fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+
+     for column in fr_map:
+         if not column.isidentifier():
+             raise DatasetPrepareError(
+                 name,
+                 f"import from pandas error - '{column}' cannot be a column name",
+             )
+
+     return from_values(
+         name,
+         session,
+         settings=settings,
+         object_name=object_name,
+         in_memory=in_memory,
+         **fr_map,
+     )
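A minimal sketch of `from_pandas`; the DataFrame contents and the `"row"` object name are invented for illustration:

```py
import pandas as pd
import datachain as dc

# Column names are lowercased and must be valid Python identifiers,
# otherwise DatasetPrepareError is raised (see the check above).
df = pd.DataFrame({"name": ["a", "b"], "size": [1, 2]})
chain = dc.from_pandas(df, object_name="row")
```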
@@ -0,0 +1,65 @@
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Optional,
+ )
+
+ from datachain.lib.data_model import DataType
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_parquet(
+     path,
+     partitioning: Any = "hive",
+     output: Optional[dict[str, DataType]] = None,
+     object_name: str = "",
+     model_name: str = "",
+     source: bool = True,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain from parquet files.
+
+     Parameters:
+         path : Storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///".
+         partitioning : Any pyarrow partitioning schema.
+         output : Dictionary defining column names and their corresponding types.
+         object_name : Created object column name.
+         model_name : Generated model name.
+         source : Whether to include info about the source file.
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+
+     Example:
+         Reading a single file:
+         ```py
+         import datachain as dc
+         dc.from_parquet("s3://mybucket/file.parquet")
+         ```
+
+         Reading a partitioned dataset from a directory:
+         ```py
+         import datachain as dc
+         dc.from_parquet("s3://mybucket/dir")
+         ```
+     """
+     from .storage import from_storage
+
+     chain = from_storage(path, session=session, settings=settings, **kwargs)
+     return chain.parse_tabular(
+         output=output,
+         object_name=object_name,
+         model_name=model_name,
+         source=source,
+         format="parquet",
+         partitioning=partitioning,
+     )
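A sketch of `from_parquet` with an explicit `output` schema and `source=False`; the bucket path and the `{"id": int}` column spec are assumptions for illustration, not taken from the diff:

```py
import datachain as dc

# Pin a column type instead of relying on inference, and skip the
# source-file info column; both parameters are documented above.
chain = dc.from_parquet("s3://mybucket/dir", output={"id": int}, source=False)
```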
@@ -0,0 +1,90 @@
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ import sqlalchemy
+
+ from datachain.lib.data_model import DataType
+ from datachain.lib.file import (
+     File,
+ )
+ from datachain.lib.signal_schema import SignalSchema
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from typing_extensions import ParamSpec
+
+     from .datachain import DataChain
+
+     P = ParamSpec("P")
+
+
+ def from_records(
+     to_insert: Optional[Union[dict, list[dict]]],
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     schema: Optional[dict[str, DataType]] = None,
+ ) -> "DataChain":
+     """Create a DataChain from the provided records. This method can be used for
+     programmatically generating a chain in contrast to reading data from storage
+     or other sources.
+
+     Parameters:
+         to_insert : records (or a single record) to insert. Each record is
+             a dictionary of signals and their values.
+         schema : describes chain signals and their corresponding types
+
+     Example:
+         ```py
+         import datachain as dc
+         single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+         ```
+     """
+     from .datasets import from_dataset
+
+     session = Session.get(session, in_memory=in_memory)
+     catalog = session.catalog
+
+     name = session.generate_temp_dataset_name()
+     signal_schema = None
+     columns: list[sqlalchemy.Column] = []
+
+     if schema:
+         signal_schema = SignalSchema(schema)
+         columns = [
+             sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
+             for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+         ]
+     else:
+         columns = [
+             sqlalchemy.Column(name, typ)
+             for name, typ in File._datachain_column_types.items()
+         ]
+
+     dsr = catalog.create_dataset(
+         name,
+         columns=columns,
+         feature_schema=(
+             signal_schema.clone_without_sys_signals().serialize()
+             if signal_schema
+             else None
+         ),
+     )
+
+     session.add_dataset_version(dsr, dsr.latest_version)
+
+     if isinstance(to_insert, dict):
+         to_insert = [to_insert]
+     elif not to_insert:
+         to_insert = []
+
+     warehouse = catalog.warehouse
+     dr = warehouse.dataset_rows(dsr)
+     db = warehouse.db
+     insert_q = dr.get_table().insert()
+     for record in to_insert:
+         db.execute(insert_q.values(**record))
+     return from_dataset(name=dsr.name, session=session, settings=settings)
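A sketch of `from_records` with an explicit `schema`; the record contents are invented for illustration, and the type spec follows the `schema` parameter documented above:

```py
import datachain as dc

# Build a small chain from in-memory records with an explicit schema;
# a single dict is also accepted and is wrapped into a list (see above).
records = [{"name": "a", "size": 1}, {"name": "b", "size": 2}]
chain = dc.from_records(records, schema={"name": str, "size": int})
```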
@@ -0,0 +1,118 @@
+ import os.path
+ from typing import (
+     TYPE_CHECKING,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.file import (
+     File,
+     FileType,
+     get_file_type,
+ )
+ from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from .datachain import DataChain
+
+
+ def from_storage(
+     uri: Union[str, os.PathLike[str]],
+     *,
+     type: FileType = "binary",
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     recursive: Optional[bool] = True,
+     object_name: str = "file",
+     update: bool = False,
+     anon: bool = False,
+     client_config: Optional[dict] = None,
+ ) -> "DataChain":
+     """Get data from storage as a list of files with all file attributes.
+     It returns the chain itself as usual.
+
+     Parameters:
+         uri : storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///"
+         type : read file as "binary", "text", or "image" data. Default is "binary".
+         recursive : search recursively for the given path.
+         object_name : Created object column name.
+         update : force storage reindexing. Default is False.
+         anon : If True, the cloud bucket is treated as a public (anonymous) one.
+         client_config : Optional client configuration for the storage client.
+
+     Example:
+         Simple call from s3
+         ```py
+         import datachain as dc
+         chain = dc.from_storage("s3://my-bucket/my-dir")
+         ```
+
+         With AWS S3-compatible storage
+         ```py
+         import datachain as dc
+         chain = dc.from_storage(
+             "s3://my-bucket/my-dir",
+             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
+         )
+         ```
+
+         Pass existing session
+         ```py
+         session = Session.get()
+         import datachain as dc
+         chain = dc.from_storage("s3://my-bucket/my-dir", session=session)
+         ```
+     """
+     from .datachain import DataChain
+     from .datasets import from_dataset
+     from .records import from_records
+     from .values import from_values
+
+     file_type = get_file_type(type)
+
+     if anon:
+         client_config = (client_config or {}) | {"anon": True}
+     session = Session.get(session, client_config=client_config, in_memory=in_memory)
+     cache = session.catalog.cache
+     client_config = session.catalog.client_config
+
+     list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+         uri, session, update=update
+     )
+
+     # ds_name is None if object is a file, we don't want to use cache
+     # or do listing in that case - just read that single object
+     if not list_ds_name:
+         dc = from_values(
+             session=session,
+             settings=settings,
+             in_memory=in_memory,
+             file=[get_file_info(list_uri, cache, client_config=client_config)],
+         )
+         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+         return dc
+
+     if update or not list_ds_exists:
+         # disable prefetch for listing, as it pre-downloads all files
+         (
+             from_records(
+                 DataChain.DEFAULT_FILE_RECORD,
+                 session=session,
+                 settings=settings,
+                 in_memory=in_memory,
+             )
+             .settings(prefetch=0)
+             .gen(
+                 list_bucket(list_uri, cache, client_config=client_config),
+                 output={f"{object_name}": File},
+             )
+             .save(list_ds_name, listing=True)
+         )
+
+     dc = from_dataset(list_ds_name, session=session, settings=settings)
+     dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+
+     return ls(dc, list_path, recursive=recursive, object_name=object_name)
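Finally, a sketch of `from_storage` with the `anon` and `update` flags described above; the bucket URI is a placeholder:

```py
import datachain as dc

# Force a re-listing of a public bucket and read files as text;
# anon merges {"anon": True} into client_config, as shown above.
chain = dc.from_storage("s3://my-public-bucket/data", type="text", anon=True, update=True)
```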