datachain 0.14.0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +18 -18
- datachain/catalog/catalog.py +6 -6
- datachain/cli/commands/show.py +2 -2
- datachain/client/fsspec.py +3 -3
- datachain/lib/dc/__init__.py +18 -18
- datachain/lib/dc/csv.py +5 -5
- datachain/lib/dc/datachain.py +42 -42
- datachain/lib/dc/datasets.py +7 -7
- datachain/lib/dc/hf.py +5 -5
- datachain/lib/dc/json.py +6 -6
- datachain/lib/dc/listings.py +2 -2
- datachain/lib/dc/pandas.py +4 -4
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +4 -4
- datachain/lib/dc/storage.py +101 -48
- datachain/lib/dc/values.py +4 -4
- datachain/lib/listing.py +11 -0
- datachain/lib/meta_formats.py +2 -2
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +1 -1
- datachain/query/dataset.py +52 -16
- datachain/toolkit/split.py +1 -1
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/METADATA +6 -6
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/RECORD +28 -28
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/WHEEL +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.0.dist-info → datachain-0.14.2.dist-info}/top_level.txt +0 -0
datachain/__init__.py CHANGED

```diff
@@ -5,16 +5,16 @@ from datachain.lib.dc import (
     DataChain,
     Sys,
     datasets,
-    from_csv,
-    from_dataset,
-    from_hf,
-    from_json,
-    from_pandas,
-    from_parquet,
-    from_records,
-    from_storage,
-    from_values,
     listings,
+    read_csv,
+    read_dataset,
+    read_hf,
+    read_json,
+    read_pandas,
+    read_parquet,
+    read_records,
+    read_storage,
+    read_values,
 )
 from datachain.lib.file import (
     ArrowRow,
@@ -61,17 +61,17 @@ __all__ = [
     "VideoFragment",
     "VideoFrame",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "is_chain_type",
     "listings",
     "metrics",
     "param",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
```
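The practical upshot of this rename is that code importing the old `from_*` helpers breaks on upgrade, since they are gone from the top-level `__all__`. A minimal migration sketch (the bucket path is illustrative, not from the diff):

```py
import datachain as dc

# 0.14.0 spelling (removed from the top-level API in 0.14.2):
# chain = dc.from_storage("s3://mybucket/images/")

# 0.14.2 spelling: same behavior, new name.
chain = dc.read_storage("s3://mybucket/images/")
```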
datachain/catalog/catalog.py CHANGED

```diff
@@ -583,12 +583,12 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain import from_storage
+        from datachain import read_storage
         from datachain.listing import Listing
 
-        from_storage(
+        read_storage(
             source, session=self.session, update=update, object_name=object_name
-        )
+        ).exec()
 
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
@@ -994,14 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain import from_dataset, from_storage
+        from datachain import read_dataset, read_storage
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
+                dc = read_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc = from_storage(source, session=self.session, recursive=recursive)
+                dc = read_storage(source, session=self.session, recursive=recursive)
 
             chains.append(dc)
 
```
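Note the second half of the first hunk: besides the rename, the catalog now calls `.exec()` on the chain. `read_storage` builds a lazy chain, so without a terminal operation the listing would never actually run. A sketch of the pattern, assuming `read_storage`/`exec` behave as shown in this diff (the bucket URI is illustrative):

```py
import datachain as dc

# read_storage() is lazy: constructing the chain does not touch storage.
chain = dc.read_storage("s3://mybucket/", update=True)

# .exec() forces the chain to run for its side effect (the listing),
# which is what the catalog code above now does explicitly.
chain.exec()
```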
datachain/cli/commands/show.py CHANGED

```diff
@@ -18,7 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session, from_dataset
+    from datachain import Session, read_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
 
@@ -51,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc = from_dataset(name=name, version=version, session=session)
+        dc = read_dataset(name=name, version=version, session=session)
         dc.print_schema()
```
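Taken together, the CLI's schema path now reduces to reading the dataset and printing its schema. Roughly, in user code (the dataset name and version here are illustrative):

```py
import datachain as dc

# What `datachain show --schema` does under the hood after this change:
dc.read_dataset(name="my_cats", version=1).print_schema()
```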
datachain/client/fsspec.py CHANGED

```diff
@@ -89,9 +89,9 @@ class Client(ABC):
         from .local import FileClient
         from .s3 import ClientS3
 
-        protocol = urlparse(url).scheme
+        protocol = urlparse(os.fspath(url)).scheme
 
-        if not protocol or _is_win_local_path(url):
+        if not protocol or _is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
         source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
     ) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, _ = cls.split_url(source)
+        storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
             storage_url = storage_url.removeprefix("/")
 
```
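This change is a PathLike fix rather than part of the rename: the signatures accept `Union[str, os.PathLike[str]]`, but `urlparse` and the URL splitters want plain strings. `os.fspath` normalizes both. A small stdlib-only sketch of why:

```py
import os
from pathlib import Path
from urllib.parse import urlparse

# urlparse() expects a string; os.fspath() converts str | PathLike -> str,
# so pathlib.Path inputs no longer blow up here.
local = Path("/data/images")
print(urlparse(os.fspath(local)).scheme)   # "" -> handled by FileClient
print(urlparse("s3://mybucket/x").scheme)  # "s3" -> handled by ClientS3
```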
datachain/lib/dc/__init__.py CHANGED

```diff
@@ -1,15 +1,15 @@
-from .csv import from_csv
+from .csv import read_csv
 from .datachain import C, Column, DataChain
-from .datasets import datasets, from_dataset
-from .hf import from_hf
-from .json import from_json
+from .datasets import datasets, read_dataset
+from .hf import read_hf
+from .json import read_json
 from .listings import listings
-from .pandas import from_pandas
-from .parquet import from_parquet
-from .records import from_records
-from .storage import from_storage
+from .pandas import read_pandas
+from .parquet import read_parquet
+from .records import read_records
+from .storage import read_storage
 from .utils import DatasetMergeError, DatasetPrepareError, Sys
-from .values import from_values
+from .values import read_values
 
 __all__ = [
     "C",
@@ -19,14 +19,14 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "listings",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
```
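The `datachain.lib.dc` package mirrors the top-level rename, so the new names work both from the package root and from this submodule. For instance, `read_values` (the renamed `from_values`, using the example values from the docstring below) builds a chain from in-memory data:

```py
import datachain as dc

# read_values() is the renamed from_values(): a chain from plain values.
chain = dc.read_values(fib=[1, 2, 3, 5, 8])
```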
datachain/lib/dc/csv.py CHANGED

```diff
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_csv(
+def read_csv(
     path,
     delimiter: Optional[str] = None,
     header: bool = True,
@@ -58,13 +58,13 @@ def from_csv(
         Reading a csv file:
         ```py
         import datachain as dc
-        chain = dc.from_csv("s3://mybucket/file.csv")
+        chain = dc.read_csv("s3://mybucket/file.csv")
         ```
 
         Reading csv files from a directory as a combined dataset:
         ```py
         import datachain as dc
-        chain = dc.from_csv("s3://mybucket/dir")
+        chain = dc.read_csv("s3://mybucket/dir")
         ```
     """
     from pandas.io.parsers.readers import STR_NA_VALUES
@@ -72,7 +72,7 @@ def from_csv(
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias
 
-    from .storage import from_storage
+    from .storage import read_storage
 
     parse_options = parse_options or {}
     if "delimiter" not in parse_options:
@@ -88,7 +88,7 @@ def from_csv(
     else:
         column_types = {}
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
 
     column_names = None
     if not header:
```
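Since only the entry point was renamed, `read_csv` keeps the `from_csv` signature shown above (`delimiter`, `header`, plus the `session`/`settings`/storage kwargs it forwards). A usage sketch with the same bucket path as the docstring:

```py
import datachain as dc

# Same options as the old from_csv(); only the name changed.
chain = dc.read_csv("s3://mybucket/file.csv", delimiter=",", header=True)
```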
datachain/lib/dc/datachain.py CHANGED

```diff
@@ -84,22 +84,22 @@ class DataChain:
     underlyind library `Pydantic`.
 
     See Also:
-        `from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        `read_storage("s3://my-bucket/my-dir/")` - reading unstructured
         data files from storages such as S3, gs or Azure ADLS.
 
         `DataChain.save("name")` - saving to a dataset.
 
-        `from_dataset("name")` - reading from a dataset.
+        `read_dataset("name")` - reading from a dataset.
 
-        `from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
+        `read_values(fib=[1, 2, 3, 5, 8])` - generating from values.
 
-        `from_pandas(pd.DataFrame(...))` - generating from pandas.
+        `read_pandas(pd.DataFrame(...))` - generating from pandas.
 
-        `from_json("file.json")` - generating from json.
+        `read_json("file.json")` - generating from json.
 
-        `from_csv("file.csv")` - generating from csv.
+        `read_csv("file.csv")` - generating from csv.
 
-        `from_parquet("file.parquet")` - generating from parquet.
+        `read_parquet("file.parquet")` - generating from parquet.
 
     Example:
         ```py
@@ -118,7 +118,7 @@ class DataChain:
         api_key = os.environ["MISTRAL_API_KEY"]
 
         chain = (
-            dc.from_storage("gs://datachain-demo/chatbot-KiT/")
+            dc.read_storage("gs://datachain-demo/chatbot-KiT/")
             .limit(5)
             .settings(cache=True, parallel=5)
             .map(
@@ -315,27 +315,27 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .storage import from_storage
+        from .storage import read_storage
 
        warnings.warn(
            "Class method `from_storage` is deprecated. "
-            "Use `from_storage` function instead from top_module.",
+            "Use `read_storage` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_storage(*args, **kwargs)
+        return read_storage(*args, **kwargs)
 
     @classmethod
     def from_dataset(cls, *args, **kwargs) -> "DataChain":
-        from .datasets import from_dataset
+        from .datasets import read_dataset
 
        warnings.warn(
            "Class method `from_dataset` is deprecated. "
-            "Use `from_dataset` function instead from top_module.",
+            "Use `read_dataset` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_dataset(*args, **kwargs)
+        return read_dataset(*args, **kwargs)
 
     @classmethod
     def from_json(
@@ -343,15 +343,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .json import from_json
+        from .json import read_json
 
        warnings.warn(
            "Class method `from_json` is deprecated. "
-            "Use `from_json` function instead from top_module.",
+            "Use `read_json` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_json(*args, **kwargs)
+        return read_json(*args, **kwargs)
 
     def explode(
         self,
@@ -487,7 +487,7 @@ class DataChain:
         )
 
         chain = (
-            dc.from_storage("s3://my-bucket")
+            dc.read_storage("s3://my-bucket")
             .apply(parse_stem)
             .filter(C("stem").glob("*cat*"))
         )
@@ -727,7 +727,7 @@ class DataChain:
 
         Note:
             Order is not guaranteed when steps are added after an `order_by` statement.
-            I.e. when using `from_dataset` an `order_by` statement should be used if
+            I.e. when using `read_dataset` an `order_by` statement should be used if
             the order of the records in the chain is important.
             Using `order_by` directly before `limit`, `collect` and `collect_flatten`
             will give expected results.
@@ -1466,15 +1466,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .values import from_values
+        from .values import read_values
 
        warnings.warn(
            "Class method `from_values` is deprecated. "
-            "Use `from_values` function instead from top_module.",
+            "Use `read_values` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_values(*args, **kwargs)
+        return read_values(*args, **kwargs)
 
     @classmethod
     def from_pandas(
@@ -1482,15 +1482,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .pandas import from_pandas
+        from .pandas import read_pandas
 
        warnings.warn(
            "Class method `from_pandas` is deprecated. "
-            "Use `from_pandas` function instead from top_module.",
+            "Use `read_pandas` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_pandas(*args, **kwargs)
+        return read_pandas(*args, **kwargs)
 
     def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
@@ -1575,15 +1575,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .hf import from_hf
+        from .hf import read_hf
 
        warnings.warn(
            "Class method `from_hf` is deprecated. "
-            "Use `from_hf` function instead from top_module.",
+            "Use `read_hf` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_hf(*args, **kwargs)
+        return read_hf(*args, **kwargs)
 
     def parse_tabular(
         self,
@@ -1610,7 +1610,7 @@ class DataChain:
         Reading a json lines file:
         ```py
         import datachain as dc
-        chain = dc.from_storage("s3://mybucket/file.jsonl")
+        chain = dc.read_storage("s3://mybucket/file.jsonl")
         chain = chain.parse_tabular(format="json")
         ```
 
@@ -1618,7 +1618,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        chain = dc.from_storage("s3://mybucket")
+        chain = dc.read_storage("s3://mybucket")
         chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
         chain = chain.parse_tabular(format="json")
         ```
@@ -1680,15 +1680,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .csv import from_csv
+        from .csv import read_csv
 
        warnings.warn(
            "Class method `from_csv` is deprecated. "
-            "Use `from_csv` function instead from top_module.",
+            "Use `read_csv` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_csv(*args, **kwargs)
+        return read_csv(*args, **kwargs)
 
     @classmethod
     def from_parquet(
@@ -1696,15 +1696,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .parquet import from_parquet
+        from .parquet import read_parquet
 
        warnings.warn(
            "Class method `from_parquet` is deprecated. "
-            "Use `from_parquet` function instead from top_module.",
+            "Use `read_parquet` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_parquet(*args, **kwargs)
+        return read_parquet(*args, **kwargs)
 
     def to_parquet(
         self,
@@ -1930,15 +1930,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .records import from_records
+        from .records import read_records
 
        warnings.warn(
            "Class method `from_records` is deprecated. "
-            "Use `from_records` function instead from top_module.",
+            "Use `read_records` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,
        )
-        return from_records(*args, **kwargs)
+        return read_records(*args, **kwargs)
 
     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""
@@ -1969,7 +1969,7 @@ class DataChain:
         import datachain as dc
 
         (
-            dc.from_storage(DATA, type="text")
+            dc.read_storage(DATA, type="text")
             .settings(parallel=4, cache=True)
             .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
             .map(
@@ -2021,7 +2021,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        ds = dc.from_storage("s3://mybucket")
+        ds = dc.read_storage("s3://mybucket")
         ds.to_storage("gs://mybucket", placement="filename")
         ```
     """
@@ -2139,7 +2139,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        chain = dc.from_storage(...)
+        chain = dc.read_storage(...)
         chunk_1 = query._chunk(0, 2)
         chunk_2 = query._chunk(1, 2)
         ```
```
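All of these classmethod shims follow the same shape: import the renamed function, emit a `DeprecationWarning`, and forward `*args, **kwargs` unchanged. Old call sites therefore keep working during the transition, just noisily. A sketch of what callers see, assuming the shim signatures forward keyword arguments as shown above:

```py
import warnings
import datachain as dc

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old spelling: still works, but routes through the deprecation shim.
    chain = dc.DataChain.from_values(fib=[1, 2, 3])

print(caught[0].category.__name__)  # DeprecationWarning
print(str(caught[0].message))       # message points at read_values
```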
datachain/lib/dc/datasets.py CHANGED

```diff
@@ -13,7 +13,7 @@ from datachain.query import Session
 from datachain.query.dataset import DatasetQuery
 
 from .utils import Sys
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_dataset(
+def read_dataset(
     name: str,
     version: Optional[int] = None,
     session: Optional[Session] = None,
@@ -44,15 +44,15 @@ def from_dataset(
     Example:
         ```py
         import datachain as dc
-        chain = dc.from_dataset("my_cats")
+        chain = dc.read_dataset("my_cats")
         ```
 
         ```py
-        chain = dc.from_dataset("my_cats", fallback_to_studio=False)
+        chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```
 
         ```py
-        chain = dc.from_dataset("my_cats", version=1)
+        chain = dc.read_dataset("my_cats", version=1)
         ```
 
         ```py
@@ -64,7 +64,7 @@ def from_dataset(
             "min_task_size": 1000,
             "prefetch": 10,
         }
-        chain = dc.from_dataset(
+        chain = dc.read_dataset(
             name="my_cats",
             version=1,
             session=session,
@@ -140,7 +140,7 @@ def datasets(
         )
     ]
 
-    return from_values(
+    return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
```
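Note that `datasets()` itself now ends in `read_values(...)`, so the catalog's dataset records come back as an ordinary chain. Assuming the default `dataset` object name and the documented `DatasetInfo` fields (not shown in this diff), listing them looks roughly like:

```py
import datachain as dc

# datasets() returns a chain whose rows describe saved datasets.
for info in dc.datasets().collect("dataset"):
    print(info.name, info.version)
```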
datachain/lib/dc/hf.py CHANGED

```diff
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_hf(
+def read_hf(
     dataset: Union[str, "HFDatasetType"],
     *args,
     session: Optional[Session] = None,
@@ -42,7 +42,7 @@ def from_hf(
     Load from Hugging Face Hub:
         ```py
         import datachain as dc
-        chain = dc.from_hf("beans", split="train")
+        chain = dc.read_hf("beans", split="train")
         ```
 
     Generate chain from loaded dataset:
@@ -50,12 +50,12 @@ def from_hf(
         from datasets import load_dataset
         ds = load_dataset("beans", split="train")
         import datachain as dc
-        chain = dc.from_hf(ds)
+        chain = dc.read_hf(ds)
         ```
     """
     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
 
-    from .values import from_values
+    from .values import read_values
 
     output: dict[str, DataType] = {}
     ds_dict = stream_splits(dataset, *args, **kwargs)
@@ -69,5 +69,5 @@ def from_hf(
     if object_name:
         output = {object_name: model}
 
-    chain = from_values(split=list(ds_dict.keys()), session=session, settings=settings)
+    chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
```
datachain/lib/dc/json.py CHANGED

```diff
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_json(
+def read_json(
     path: Union[str, os.PathLike[str]],
     type: FileType = "text",
     spec: Optional[DataType] = None,
@@ -52,19 +52,19 @@ def from_json(
     infer JSON schema from data, reduce using JMESPATH
         ```py
         import datachain as dc
-        chain = dc.from_json("gs://json", jmespath="key1.key2")
+        chain = dc.read_json("gs://json", jmespath="key1.key2")
         ```
 
     infer JSON schema from a particular path
         ```py
         import datachain as dc
-        chain = dc.from_json("gs://json_ds", schema_from="gs://json/my.json")
+        chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
     if schema_from == "auto":
-        schema_from = path
+        schema_from = os.fspath(path)
 
     def jmespath_to_name(s: str):
         name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
@@ -74,7 +74,7 @@ def from_json(
     object_name = jmespath_to_name(jmespath)
     if not object_name:
         object_name = format
-    chain = from_storage(uri=path, type=type, **kwargs)
+    chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
         object_name: read_meta(
             schema_from=schema_from,
```
datachain/lib/dc/listings.py CHANGED

```diff
@@ -6,7 +6,7 @@ from typing import (
 from datachain.lib.listing_info import ListingInfo
 from datachain.query import Session
 
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -35,7 +35,7 @@ def listings(
     session = Session.get(session, in_memory=in_memory)
     catalog = kwargs.get("catalog") or session.catalog
 
-    return from_values(
+    return read_values(
         session=session,
         in_memory=in_memory,
         output={object_name: ListingInfo},
```
datachain/lib/dc/pandas.py CHANGED

```diff
@@ -5,7 +5,7 @@ from typing import (
 
 from datachain.query import Session
 
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_pandas(  # type: ignore[override]
+def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
     session: Optional[Session] = None,
@@ -32,7 +32,7 @@ def from_pandas(  # type: ignore[override]
         import datachain as dc
 
         df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
-        dc.from_pandas(df)
+        dc.read_pandas(df)
         ```
     """
     from .utils import DatasetPrepareError
@@ -46,7 +46,7 @@ def from_pandas(  # type: ignore[override]
             f"import from pandas error - '{column}' cannot be a column name",
         )
 
-    return from_values(
+    return read_values(
         name,
         session,
         settings=settings,
```