datachain 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +18 -18
- datachain/catalog/catalog.py +6 -6
- datachain/cli/commands/show.py +2 -2
- datachain/client/fsspec.py +3 -3
- datachain/lib/dc/__init__.py +18 -18
- datachain/lib/dc/csv.py +5 -5
- datachain/lib/dc/datachain.py +42 -42
- datachain/lib/dc/datasets.py +7 -7
- datachain/lib/dc/hf.py +5 -5
- datachain/lib/dc/json.py +6 -6
- datachain/lib/dc/listings.py +2 -2
- datachain/lib/dc/pandas.py +4 -4
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +4 -4
- datachain/lib/dc/storage.py +101 -48
- datachain/lib/dc/values.py +4 -4
- datachain/lib/listing.py +11 -0
- datachain/lib/meta_formats.py +2 -2
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +1 -1
- datachain/query/dataset.py +52 -16
- datachain/toolkit/split.py +1 -1
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/METADATA +6 -6
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/RECORD +28 -28
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/WHEEL +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/top_level.txt +0 -0
datachain/lib/dc/parquet.py
CHANGED
````diff
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_parquet(
+def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
@@ -43,18 +43,18 @@ def from_parquet(
         Reading a single file:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/file.parquet")
+        dc.read_parquet("s3://mybucket/file.parquet")
         ```
 
         Reading a partitioned dataset from a directory:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/dir")
+        dc.read_parquet("s3://mybucket/dir")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
         object_name=object_name,
````
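The only functional change here is the `from_parquet` → `read_parquet` rename. A minimal before/after sketch, assuming the top-level `datachain` namespace re-exports the renamed reader as the docstring above suggests (the bucket path is a placeholder):

```python
import datachain as dc

# 0.14.0 spelling (the removed docstring lines above):
# chain = dc.from_parquet("s3://mybucket/file.parquet")

# 0.14.2 spelling (the added docstring lines above):
chain = dc.read_parquet("s3://mybucket/file.parquet")
chain.show()
```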
datachain/lib/dc/records.py
CHANGED
````diff
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_records(
+def read_records(
     to_insert: Optional[Union[dict, list[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -40,10 +40,10 @@ def from_records(
     Example:
         ```py
         import datachain as dc
-        single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+        single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
     """
-    from .datasets import from_dataset
+    from .datasets import read_dataset
 
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
@@ -87,4 +87,4 @@ def from_records(
     insert_q = dr.get_table().insert()
     for record in to_insert:
         db.execute(insert_q.values(**record))
-    return from_dataset(name=dsr.name, session=session, settings=settings)
+    return read_dataset(name=dsr.name, session=session, settings=settings)
````
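For context, the renamed `read_records` reader is typically used to seed a chain with the default file record, exactly as its docstring shows; a short sketch assuming the same top-level re-export:

```python
import datachain as dc

# Seed a chain with the built-in default file record, then inspect it.
single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
single_record.show()
```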
datachain/lib/dc/storage.py
CHANGED
````diff
@@ -6,19 +6,23 @@ from typing import (
 )
 
 from datachain.lib.file import (
-    File,
     FileType,
     get_file_type,
 )
-from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
+from datachain.lib.listing import (
+    get_file_info,
+    get_listing,
+    list_bucket,
+    ls,
+)
 from datachain.query import Session
 
 if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_storage(
-    uri: Union[str, os.PathLike[str]],
+def read_storage(
+    uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     *,
     type: FileType = "binary",
     session: Optional[Session] = None,
@@ -30,11 +34,12 @@ def from_storage(
     anon: bool = False,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
-    """Get data from a storage as a list of file with all file attributes.
+    """Get data from storage(s) as a list of file with all file attributes.
     It returns the chain itself as usual.
 
     Parameters:
-        uri : storage URI with directory. URI must start with storage prefix such
+        uri : storage URI with directory or list of URIs.
+            URIs must start with storage prefix such
             as `s3://`, `gs://`, `az://` or "file:///"
         type : read file as "binary", "text", or "image" data. Default is "binary".
         recursive : search recursively for the given path.
@@ -43,17 +48,27 @@ def from_storage(
         anon : If True, we will treat cloud bucket as public one
         client_config : Optional client configuration for the storage client.
 
-    Example:
-        Simple call from s3:
-        ```py
+    Returns:
+        DataChain: A DataChain object containing the file information.
+
+    Examples:
+        Simple call from s3:
+        ```python
         import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir")
+        chain = dc.read_storage("s3://my-bucket/my-dir")
         ```
 
-        With AWS S3-compatible storage:
-        ```py
-        import datachain as dc
-        chain = dc.from_storage(
+        Multiple URIs:
+        ```python
+        chain = dc.read_storage([
+            "s3://bucket1/dir1",
+            "s3://bucket2/dir2"
+        ])
+        ```
+
+        With AWS S3-compatible storage:
+        ```python
+        chain = dc.read_storage(
             "s3://my-bucket/my-dir",
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
@@ -62,14 +77,20 @@ def from_storage(
         Pass existing session
         ```py
         session = Session.get()
-        chain = dc.from_storage(
-            "path/to/dir", session=session, recursive=True)
+        chain = dc.read_storage([
+            "path/to/dir1",
+            "path/to/dir2"
+        ], session=session, recursive=True)
         ```
+
+    Note:
+        When using multiple URIs with `update=True`, the function optimizes by
+        avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
-    from .datasets import from_dataset
-    from .records import from_records
-    from .values import from_values
+    from .datasets import read_dataset
+    from .records import read_records
+    from .values import read_values
 
     file_type = get_file_type(type)
 
@@ -79,40 +100,72 @@ def from_storage(
     cache = session.catalog.cache
     client_config = session.catalog.client_config
 
-    list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
-        uri, session, update=update
-    )
+    uris = uri if isinstance(uri, (list, tuple)) else [uri]
+
+    if not uris:
+        raise ValueError("No URIs provided")
+
+    storage_chain = None
+    listed_ds_name = set()
+    file_values = []
+
+    for single_uri in uris:
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
+            single_uri, session, update=update
+        )
+
+        # list_ds_name is None if object is a file, we don't want to use cache
+        # or do listing in that case - just read that single object
+        if not list_ds_name:
+            file_values.append(
+                get_file_info(list_uri, cache, client_config=client_config)
+            )
+            continue
+
+        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc._query.update = update
+        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
-    # list_ds_name is None if object is a file, we don't want to use cache
-    # or do listing in that case - just read that single object
-    if not list_ds_name:
-        dc = from_values(
+        if update or not list_ds_exists:
+
+            def lst_fn(ds_name, lst_uri):
+                # disable prefetch for listing, as it pre-downloads all files
+                (
+                    read_records(
+                        DataChain.DEFAULT_FILE_RECORD,
+                        session=session,
+                        settings=settings,
+                        in_memory=in_memory,
+                    )
+                    .settings(prefetch=0)
+                    .gen(
+                        list_bucket(lst_uri, cache, client_config=client_config),
+                        output={f"{object_name}": file_type},
+                    )
+                    .save(ds_name, listing=True)
+                )
+
+            dc._query.set_listing_fn(
+                lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
+            )
+
+        chain = ls(dc, list_path, recursive=recursive, object_name=object_name)
+
+        storage_chain = storage_chain.union(chain) if storage_chain else chain
+        listed_ds_name.add(list_ds_name)
+
+    if file_values:
+        file_chain = read_values(
             session=session,
             settings=settings,
            in_memory=in_memory,
-            file=[get_file_info(list_uri, cache, client_config=client_config)],
+            file=file_values,
        )
-        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-        return dc
-
-    if update or not list_ds_exists:
-        # disable prefetch for listing, as it pre-downloads all files
-        (
-            from_records(
-                DataChain.DEFAULT_FILE_RECORD,
-                session=session,
-                settings=settings,
-                in_memory=in_memory,
-            )
-            .settings(prefetch=0)
-            .gen(
-                list_bucket(list_uri, cache, client_config=client_config),
-                output={f"{object_name}": File},
-            )
-            .save(list_ds_name, listing=True)
+        file_chain.signals_schema = file_chain.signals_schema.mutate(
+            {f"{object_name}": file_type}
         )
+        storage_chain = storage_chain.union(file_chain) if storage_chain else file_chain
 
-    dc = from_dataset(list_ds_name, session=session, settings=settings)
-    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+    assert storage_chain is not None
 
-    return ls(dc, list_path, recursive=recursive, object_name=object_name)
+    return storage_chain
````
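The headline change in this file is that `read_storage` now accepts a list of URIs and unions the per-URI listings into a single chain. A hedged usage sketch based on the docstring above (bucket names are placeholders; `update=True` forces re-listing):

```python
import datachain as dc

# One directory, as in 0.14.0:
files = dc.read_storage("s3://my-bucket/my-dir", recursive=True)

# New in 0.14.2: several locations in a single call; the results are
# unioned into one chain of File objects.
combined = dc.read_storage(
    ["s3://bucket1/dir1", "s3://bucket2/dir2"],
    type="binary",
    update=False,  # set True to force re-listing before the listing TTL expires
)
combined.show()
```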
datachain/lib/dc/values.py
CHANGED
````diff
@@ -6,7 +6,7 @@ from typing import (
 
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
-from datachain.lib.dc.records import from_records
+from datachain.lib.dc.records import read_records
 from datachain.lib.dc.utils import OutputType
 from datachain.query import Session
 
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
 P = ParamSpec("P")
 
 
-def from_values(
+def read_values(
     ds_name: str = "",
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -32,7 +32,7 @@ def from_values(
     Example:
         ```py
         import datachain as dc
-        dc.from_values(fib=[1, 2, 3, 5, 8])
+        dc.read_values(fib=[1, 2, 3, 5, 8])
         ```
     """
     from .datachain import DataChain
@@ -42,7 +42,7 @@ def from_values(
     def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
         yield from tuples
 
-    chain = from_records(
+    chain = read_records(
         DataChain.DEFAULT_FILE_RECORD,
         session=session,
         settings=settings,
````
datachain/lib/listing.py
CHANGED
````diff
@@ -4,6 +4,7 @@ import os
 import posixpath
 from collections.abc import Iterator
 from contextlib import contextmanager
+from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 from fsspec.asyn import get_loop
@@ -32,6 +33,16 @@ logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
 logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
 
 
+def listing_dataset_expired(lst_ds) -> bool:
+    """Function that checks if listing dataset is expired or not"""
+    lst_version = lst_ds.versions[-1]
+    if not lst_version.finished_at:
+        return False
+
+    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
+    return datetime.now(timezone.utc) > expires
+
+
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
````
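The new `listing_dataset_expired` helper is plain wall-clock arithmetic against a listing TTL. A self-contained sketch of the same check, using an illustrative dataclass and TTL value instead of datachain's own dataset objects and `LISTING_TTL` constant:

```python
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

TTL_SECONDS = 4 * 60 * 60  # illustrative TTL; the real value is datachain's LISTING_TTL


@dataclass
class ListingVersion:
    finished_at: Optional[datetime]


def listing_expired(version: ListingVersion, ttl: int = TTL_SECONDS) -> bool:
    # A listing that never finished is not treated as expired.
    if version.finished_at is None:
        return False
    return datetime.now(timezone.utc) > version.finished_at + timedelta(seconds=ttl)


# A listing finished five hours ago is stale under a four-hour TTL.
stale = ListingVersion(finished_at=datetime.now(timezone.utc) - timedelta(hours=5))
print(listing_expired(stale))  # True
```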
datachain/lib/meta_formats.py
CHANGED
````diff
@@ -103,10 +103,10 @@ def read_meta(  # noqa: C901
     model_name=None,
     nrows=None,
 ) -> Callable:
-    from datachain import from_storage
+    from datachain import read_storage
 
     if schema_from:
-        file = next(from_storage(schema_from, type="text").limit(1).collect("file"))
+        file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
     model_code = gen_datamodel_code(
         file, format=format, jmespath=jmespath, model_name=model_name
     )
````
datachain/lib/pytorch.py
CHANGED
````diff
@@ -14,7 +14,7 @@ from torchvision.transforms import v2
 from datachain import Session
 from datachain.cache import get_temp_cache
 from datachain.catalog import Catalog, get_catalog
-from datachain.lib.dc.datasets import from_dataset
+from datachain.lib.dc.datasets import read_dataset
 from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 from datachain.progress import CombinedDownloadCallback
@@ -122,7 +122,7 @@ class PytorchDataset(IterableDataset):
     ) -> Generator[tuple[Any, ...], None, None]:
         catalog = self._get_catalog()
         session = Session("PyTorch", catalog=catalog)
-        ds = from_dataset(
+        ds = read_dataset(
             name=self.name, version=self.version, session=session
         ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
````
datachain/lib/udf.py
CHANGED
datachain/query/dataset.py
CHANGED
````diff
@@ -47,6 +47,10 @@ from datachain.error import (
     QueryScriptCancelError,
 )
 from datachain.func.base import Function
+from datachain.lib.listing import (
+    is_listing_dataset,
+    listing_dataset_expired,
+)
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -151,13 +155,6 @@ def step_result(
     )
 
 
-class StartingStep(ABC):
-    """An initial query processing step, referencing a data source."""
-
-    @abstractmethod
-    def apply(self) -> "StepResult": ...
-
-
 @frozen
 class Step(ABC):
     """A query processing step (filtering, mutation, etc.)"""
@@ -170,7 +167,7 @@ class Step(ABC):
 
 
 @frozen
-class QueryStep(StartingStep):
+class QueryStep:
     catalog: "Catalog"
     dataset_name: str
     dataset_version: int
@@ -1086,6 +1083,7 @@ class DatasetQuery:
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
         fallback_to_studio: bool = True,
+        update: bool = False,
     ) -> None:
         from datachain.remote.studio import is_token_set
 
@@ -1097,26 +1095,44 @@ class DatasetQuery:
         self.temp_table_names: list[str] = []
         self.dependencies: set[DatasetDependencyType] = set()
         self.table = self.get_table()
-        self.starting_step: StartingStep
+        self.starting_step: Optional[QueryStep] = None
         self.name: Optional[str] = None
         self.version: Optional[int] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
+        self.before_steps: list[Callable] = []
+        self.listing_fn: Optional[Callable] = None
+        self.update = update
 
-        self.name = name
+        self.list_ds_name: Optional[str] = None
 
-        if fallback_to_studio and is_token_set():
-            ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+        self.name = name
+        self.dialect = self.catalog.warehouse.db.dialect
+        if version:
+            self.version = version
+
+        if is_listing_dataset(name):
+            # not setting query step yet as listing dataset might not exist at
+            # this point
+            self.list_ds_name = name
+        elif fallback_to_studio and is_token_set():
+            self._set_starting_step(
+                self.catalog.get_dataset_with_remote_fallback(name, version)
+            )
         else:
-            ds = self.catalog.get_dataset(name)
+            self._set_starting_step(self.catalog.get_dataset(name))
+
+    def _set_starting_step(self, ds: "DatasetRecord") -> None:
+        if not self.version:
+            self.version = ds.latest_version
+
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)
 
-        self.version = version or ds.latest_version
+        # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.starting_step = QueryStep(self.catalog, name, self.version)
-        self.dialect = self.catalog.warehouse.db.dialect
 
     def __iter__(self):
         return iter(self.db_results())
@@ -1180,11 +1196,30 @@
             col.table = self.table
         return col
 
+    def set_listing_fn(self, fn: Callable) -> None:
+        """Setting listing function to be run if needed"""
+        self.listing_fn = fn
+
     def apply_steps(self) -> QueryGenerator:
         """
         Apply the steps in the query and return the resulting
         sqlalchemy.SelectBase.
         """
+        if self.list_ds_name and not self.starting_step:
+            listing_ds = None
+            try:
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+            except DatasetNotFoundError:
+                pass
+
+            if not listing_ds or self.update or listing_dataset_expired(listing_ds):
+                assert self.listing_fn
+                self.listing_fn()
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+
+            # at this point we know what is our starting listing dataset name
+            self._set_starting_step(listing_ds)  # type: ignore [arg-type]
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
@@ -1203,6 +1238,7 @@
             query = query.filter(C.sys__rand % total == index)
             query.steps = query.steps[-1:] + query.steps[:-1]
 
+        assert query.starting_step
         result = query.starting_step.apply()
         self.dependencies.update(result.dependencies)
 
````
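The net effect of these changes is that a listing dataset is no longer resolved in `__init__`; `apply_steps` materializes it on first use, and only when it is missing, explicitly updated, or expired. A generic, datachain-independent sketch of that lazy-resolution pattern (all names here are illustrative):

```python
from typing import Callable, Generic, Optional, TypeVar

T = TypeVar("T")


class LazyListing(Generic[T]):
    """Resolve an expensive listing only when a query actually needs it."""

    def __init__(
        self,
        lookup: Callable[[], Optional[T]],   # returns a cached listing, or None
        materialize: Callable[[], T],        # runs the expensive listing
        is_stale: Callable[[T], bool] = lambda _: False,
        force_update: bool = False,
    ) -> None:
        self._lookup = lookup
        self._materialize = materialize
        self._is_stale = is_stale
        self._force_update = force_update

    def resolve(self) -> T:
        cached = self._lookup()
        if cached is None or self._force_update or self._is_stale(cached):
            cached = self._materialize()
        return cached


# The listing callback only runs when nothing is cached, an update was
# requested, or the cached listing is expired -- mirroring apply_steps above.
listing = LazyListing(lookup=lambda: None, materialize=lambda: ["a.csv", "b.csv"])
print(listing.resolve())  # ['a.csv', 'b.csv']
```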
datachain/toolkit/split.py
CHANGED
````diff
@@ -41,7 +41,7 @@ def train_test_split(
         from datachain.toolkit import train_test_split
 
         # Load a DataChain from a storage source (e.g., S3 bucket)
-        dc = dc.from_storage("s3://bucket/dir/")
+        dc = dc.read_storage("s3://bucket/dir/")
 
         # Perform a 70/30 train-test split
         train, test = train_test_split(dc, [0.7, 0.3])
````
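A runnable variant of the docstring example that swaps the S3 listing for in-memory values, so the split can be tried without credentials (the column name and weights are illustrative):

```python
import datachain as dc
from datachain.toolkit import train_test_split

# Ten rows built in memory instead of listing an S3 bucket.
chain = dc.read_values(idx=list(range(10)))

train, test = train_test_split(chain, [0.7, 0.3])
print(train.count(), test.count())  # roughly a 70/30 split
```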
{datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/METADATA
CHANGED
````diff
@@ -1,9 +1,9 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.0
+Version: 0.14.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
-License: Apache-2.0
+License-Expression: Apache-2.0
 Project-URL: Documentation, https://datachain.dvc.ai
 Project-URL: Issues, https://github.com/iterative/datachain/issues
 Project-URL: Source, https://github.com/iterative/datachain
@@ -38,7 +38,7 @@ Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
 Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<3,>=2
+Requires-Dist: pydantic<2.11,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
@@ -171,8 +171,8 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
-images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
 annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")
````