datachain 0.14.1__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff compares publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +18 -18
- datachain/catalog/catalog.py +5 -5
- datachain/cli/commands/show.py +2 -2
- datachain/lib/dc/__init__.py +18 -18
- datachain/lib/dc/csv.py +5 -5
- datachain/lib/dc/datachain.py +42 -42
- datachain/lib/dc/datasets.py +7 -7
- datachain/lib/dc/hf.py +5 -5
- datachain/lib/dc/json.py +5 -5
- datachain/lib/dc/listings.py +2 -2
- datachain/lib/dc/pandas.py +4 -4
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +4 -4
- datachain/lib/dc/storage.py +13 -12
- datachain/lib/dc/values.py +4 -4
- datachain/lib/listing.py +11 -0
- datachain/lib/meta_formats.py +2 -2
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +1 -1
- datachain/query/dataset.py +23 -10
- datachain/toolkit/split.py +1 -1
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/METADATA +5 -5
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/RECORD +27 -27
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/WHEEL +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/top_level.txt +0 -0
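The bulk of this release is an API rename: every top-level `from_*` constructor becomes `read_*`, while the `DataChain.from_*` class methods remain as deprecated shims that delegate to the new functions. A minimal migration sketch (the bucket paths are placeholders):

```py
import datachain as dc

# 0.14.1 spelling (module-level from_* functions are gone in 0.14.2):
# chain = dc.from_storage("s3://my-bucket/images/")

# 0.14.2 spelling:
chain = dc.read_storage("s3://my-bucket/images/")
meta = dc.read_json("s3://my-bucket/meta.json")
```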
datachain/__init__.py
CHANGED
@@ -5,16 +5,16 @@ from datachain.lib.dc import (
     DataChain,
     Sys,
     datasets,
-    from_csv,
-    from_dataset,
-    from_hf,
-    from_json,
-    from_pandas,
-    from_parquet,
-    from_records,
-    from_storage,
-    from_values,
     listings,
+    read_csv,
+    read_dataset,
+    read_hf,
+    read_json,
+    read_pandas,
+    read_parquet,
+    read_records,
+    read_storage,
+    read_values,
 )
 from datachain.lib.file import (
     ArrowRow,
@@ -61,17 +61,17 @@ __all__ = [
     "VideoFragment",
     "VideoFrame",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "is_chain_type",
     "listings",
     "metrics",
     "param",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
datachain/catalog/catalog.py
CHANGED
@@ -583,10 +583,10 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain import from_storage
+        from datachain import read_storage
         from datachain.listing import Listing
 
-        from_storage(
+        read_storage(
             source, session=self.session, update=update, object_name=object_name
         ).exec()
 
@@ -994,14 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain import from_dataset, from_storage
+        from datachain import read_dataset, read_storage
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
+                dc = read_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc = from_storage(source, session=self.session, recursive=recursive)
+                dc = read_storage(source, session=self.session, recursive=recursive)
 
             chains.append(dc)
 
datachain/cli/commands/show.py
CHANGED
@@ -18,7 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session, from_dataset
+    from datachain import Session, read_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
 
@@ -51,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc = from_dataset(name=name, version=version, session=session)
+        dc = read_dataset(name=name, version=version, session=session)
         dc.print_schema()
datachain/lib/dc/__init__.py
CHANGED
@@ -1,15 +1,15 @@
-from .csv import from_csv
+from .csv import read_csv
 from .datachain import C, Column, DataChain
-from .datasets import datasets, from_dataset
-from .hf import from_hf
-from .json import from_json
+from .datasets import datasets, read_dataset
+from .hf import read_hf
+from .json import read_json
 from .listings import listings
-from .pandas import from_pandas
-from .parquet import from_parquet
-from .records import from_records
-from .storage import from_storage
+from .pandas import read_pandas
+from .parquet import read_parquet
+from .records import read_records
+from .storage import read_storage
 from .utils import DatasetMergeError, DatasetPrepareError, Sys
-from .values import from_values
+from .values import read_values
 
 __all__ = [
     "C",
@@ -19,14 +19,14 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "listings",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
datachain/lib/dc/csv.py
CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_csv(
+def read_csv(
    path,
     delimiter: Optional[str] = None,
     header: bool = True,
@@ -58,13 +58,13 @@ def from_csv(
         Reading a csv file:
         ```py
         import datachain as dc
-        chain = dc.from_csv("s3://mybucket/file.csv")
+        chain = dc.read_csv("s3://mybucket/file.csv")
         ```
 
         Reading csv files from a directory as a combined dataset:
         ```py
         import datachain as dc
-        chain = dc.from_csv("s3://mybucket/dir")
+        chain = dc.read_csv("s3://mybucket/dir")
         ```
     """
     from pandas.io.parsers.readers import STR_NA_VALUES
@@ -72,7 +72,7 @@ def from_csv(
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias
 
-    from .storage import from_storage
+    from .storage import read_storage
 
     parse_options = parse_options or {}
     if "delimiter" not in parse_options:
@@ -88,7 +88,7 @@ def from_csv(
     else:
         column_types = {}
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
 
     column_names = None
     if not header:
datachain/lib/dc/datachain.py
CHANGED
@@ -84,22 +84,22 @@ class DataChain:
     underlyind library `Pydantic`.
 
     See Also:
-        `from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        `read_storage("s3://my-bucket/my-dir/")` - reading unstructured
             data files from storages such as S3, gs or Azure ADLS.
 
         `DataChain.save("name")` - saving to a dataset.
 
-        `from_dataset("name")` - reading from a dataset.
+        `read_dataset("name")` - reading from a dataset.
 
-        `from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
+        `read_values(fib=[1, 2, 3, 5, 8])` - generating from values.
 
-        `from_pandas(pd.DataFrame(...))` - generating from pandas.
+        `read_pandas(pd.DataFrame(...))` - generating from pandas.
 
-        `from_json("file.json")` - generating from json.
+        `read_json("file.json")` - generating from json.
 
-        `from_csv("file.csv")` - generating from csv.
+        `read_csv("file.csv")` - generating from csv.
 
-        `from_parquet("file.parquet")` - generating from parquet.
+        `read_parquet("file.parquet")` - generating from parquet.
 
     Example:
         ```py
@@ -118,7 +118,7 @@ class DataChain:
         api_key = os.environ["MISTRAL_API_KEY"]
 
         chain = (
-            dc.from_storage("gs://datachain-demo/chatbot-KiT/")
+            dc.read_storage("gs://datachain-demo/chatbot-KiT/")
             .limit(5)
             .settings(cache=True, parallel=5)
             .map(
@@ -315,27 +315,27 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .storage import from_storage
+        from .storage import read_storage
 
         warnings.warn(
             "Class method `from_storage` is deprecated. "
-            "Use `from_storage` function instead from top_module.",
+            "Use `read_storage` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_storage(*args, **kwargs)
+        return read_storage(*args, **kwargs)
 
     @classmethod
     def from_dataset(cls, *args, **kwargs) -> "DataChain":
-        from .datasets import from_dataset
+        from .datasets import read_dataset
 
         warnings.warn(
             "Class method `from_dataset` is deprecated. "
-            "Use `from_dataset` function instead from top_module.",
+            "Use `read_dataset` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_dataset(*args, **kwargs)
+        return read_dataset(*args, **kwargs)
 
     @classmethod
     def from_json(
@@ -343,15 +343,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .json import from_json
+        from .json import read_json
 
         warnings.warn(
             "Class method `from_json` is deprecated. "
-            "Use `from_json` function instead from top_module.",
+            "Use `read_json` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_json(*args, **kwargs)
+        return read_json(*args, **kwargs)
 
     def explode(
         self,
@@ -487,7 +487,7 @@ class DataChain:
             )
 
             chain = (
-                dc.from_storage("s3://my-bucket")
+                dc.read_storage("s3://my-bucket")
                 .apply(parse_stem)
                 .filter(C("stem").glob("*cat*"))
             )
@@ -727,7 +727,7 @@ class DataChain:
 
         Note:
             Order is not guaranteed when steps are added after an `order_by` statement.
-            I.e. when using `from_dataset` an `order_by` statement should be used if
+            I.e. when using `read_dataset` an `order_by` statement should be used if
             the order of the records in the chain is important.
             Using `order_by` directly before `limit`, `collect` and `collect_flatten`
             will give expected results.
@@ -1466,15 +1466,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .values import from_values
+        from .values import read_values
 
         warnings.warn(
             "Class method `from_values` is deprecated. "
-            "Use `from_values` function instead from top_module.",
+            "Use `read_values` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_values(*args, **kwargs)
+        return read_values(*args, **kwargs)
 
     @classmethod
     def from_pandas(
@@ -1482,15 +1482,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .pandas import from_pandas
+        from .pandas import read_pandas
 
         warnings.warn(
             "Class method `from_pandas` is deprecated. "
-            "Use `from_pandas` function instead from top_module.",
+            "Use `read_pandas` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_pandas(*args, **kwargs)
+        return read_pandas(*args, **kwargs)
 
     def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
@@ -1575,15 +1575,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .hf import from_hf
+        from .hf import read_hf
 
         warnings.warn(
             "Class method `from_hf` is deprecated. "
-            "Use `from_hf` function instead from top_module.",
+            "Use `read_hf` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_hf(*args, **kwargs)
+        return read_hf(*args, **kwargs)
 
     def parse_tabular(
         self,
@@ -1610,7 +1610,7 @@ class DataChain:
         Reading a json lines file:
         ```py
         import datachain as dc
-        chain = dc.from_storage("s3://mybucket/file.jsonl")
+        chain = dc.read_storage("s3://mybucket/file.jsonl")
         chain = chain.parse_tabular(format="json")
         ```
 
@@ -1618,7 +1618,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        chain = dc.from_storage("s3://mybucket")
+        chain = dc.read_storage("s3://mybucket")
         chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
         chain = chain.parse_tabular(format="json")
         ```
@@ -1680,15 +1680,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .csv import from_csv
+        from .csv import read_csv
 
         warnings.warn(
             "Class method `from_csv` is deprecated. "
-            "Use `from_csv` function instead from top_module.",
+            "Use `read_csv` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_csv(*args, **kwargs)
+        return read_csv(*args, **kwargs)
 
     @classmethod
     def from_parquet(
@@ -1696,15 +1696,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .parquet import from_parquet
+        from .parquet import read_parquet
 
         warnings.warn(
             "Class method `from_parquet` is deprecated. "
-            "Use `from_parquet` function instead from top_module.",
+            "Use `read_parquet` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_parquet(*args, **kwargs)
+        return read_parquet(*args, **kwargs)
 
     def to_parquet(
         self,
@@ -1930,15 +1930,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .records import from_records
+        from .records import read_records
 
         warnings.warn(
             "Class method `from_records` is deprecated. "
-            "Use `from_records` function instead from top_module.",
+            "Use `read_records` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_records(*args, **kwargs)
+        return read_records(*args, **kwargs)
 
     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""
@@ -1969,7 +1969,7 @@ class DataChain:
         import datachain as dc
 
         (
-            dc.from_storage(DATA, type="text")
+            dc.read_storage(DATA, type="text")
             .settings(parallel=4, cache=True)
             .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
             .map(
@@ -2021,7 +2021,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        ds = dc.from_storage("s3://mybucket")
+        ds = dc.read_storage("s3://mybucket")
         ds.to_storage("gs://mybucket", placement="filename")
         ```
         """
@@ -2139,7 +2139,7 @@ class DataChain:
         ```py
         import datachain as dc
 
-        chain = dc.from_storage(...)
+        chain = dc.read_storage(...)
         chunk_1 = query._chunk(0, 2)
         chunk_2 = query._chunk(1, 2)
         ```
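Every deprecated `DataChain.from_*` class method above follows the same shim pattern: lazily import the renamed function, warn, and delegate. A sketch of what downstream code observes when it still calls the old spelling (the bucket path is a placeholder):

```py
import warnings
import datachain as dc

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # deprecated shim; warns and delegates to dc.read_storage(...)
    chain = dc.DataChain.from_storage("s3://my-bucket/")
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```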
datachain/lib/dc/datasets.py
CHANGED
@@ -13,7 +13,7 @@ from datachain.query import Session
 from datachain.query.dataset import DatasetQuery
 
 from .utils import Sys
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_dataset(
+def read_dataset(
     name: str,
     version: Optional[int] = None,
     session: Optional[Session] = None,
@@ -44,15 +44,15 @@ def from_dataset(
     Example:
         ```py
         import datachain as dc
-        chain = dc.from_dataset("my_cats")
+        chain = dc.read_dataset("my_cats")
         ```
 
         ```py
-        chain = dc.from_dataset("my_cats", fallback_to_studio=False)
+        chain = dc.read_dataset("my_cats", fallback_to_studio=False)
         ```
 
         ```py
-        chain = dc.from_dataset("my_cats", version=1)
+        chain = dc.read_dataset("my_cats", version=1)
         ```
 
         ```py
@@ -64,7 +64,7 @@ def from_dataset(
             "min_task_size": 1000,
             "prefetch": 10,
         }
-        chain = dc.from_dataset(
+        chain = dc.read_dataset(
             name="my_cats",
             version=1,
             session=session,
@@ -140,7 +140,7 @@ def datasets(
         )
     ]
 
-    return from_values(
+    return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
datachain/lib/dc/hf.py
CHANGED
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_hf(
+def read_hf(
     dataset: Union[str, "HFDatasetType"],
     *args,
     session: Optional[Session] = None,
@@ -42,7 +42,7 @@ def from_hf(
         Load from Hugging Face Hub:
         ```py
         import datachain as dc
-        chain = dc.from_hf("beans", split="train")
+        chain = dc.read_hf("beans", split="train")
         ```
 
         Generate chain from loaded dataset:
@@ -50,12 +50,12 @@ def from_hf(
         from datasets import load_dataset
         ds = load_dataset("beans", split="train")
         import datachain as dc
-        chain = dc.from_hf(ds)
+        chain = dc.read_hf(ds)
         ```
     """
     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
 
-    from .values import from_values
+    from .values import read_values
 
     output: dict[str, DataType] = {}
     ds_dict = stream_splits(dataset, *args, **kwargs)
@@ -69,5 +69,5 @@ def from_hf(
     if object_name:
         output = {object_name: model}
 
-    chain = from_values(split=list(ds_dict.keys()), session=session, settings=settings)
+    chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
datachain/lib/dc/json.py
CHANGED
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_json(
+def read_json(
     path: Union[str, os.PathLike[str]],
     type: FileType = "text",
     spec: Optional[DataType] = None,
@@ -52,16 +52,16 @@ def from_json(
         infer JSON schema from data, reduce using JMESPATH
         ```py
         import datachain as dc
-        chain = dc.from_json("gs://json", jmespath="key1.key2")
+        chain = dc.read_json("gs://json", jmespath="key1.key2")
         ```
 
         infer JSON schema from a particular path
         ```py
         import datachain as dc
-        chain = dc.from_json("gs://json_ds", schema_from="gs://json/my.json")
+        chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
     if schema_from == "auto":
         schema_from = os.fspath(path)
@@ -74,7 +74,7 @@ def from_json(
     object_name = jmespath_to_name(jmespath)
     if not object_name:
         object_name = format
-    chain = from_storage(uri=path, type=type, **kwargs)
+    chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
         object_name: read_meta(
             schema_from=schema_from,
datachain/lib/dc/listings.py
CHANGED
@@ -6,7 +6,7 @@ from typing import (
 from datachain.lib.listing_info import ListingInfo
 from datachain.query import Session
 
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -35,7 +35,7 @@ def listings(
     session = Session.get(session, in_memory=in_memory)
     catalog = kwargs.get("catalog") or session.catalog
 
-    return from_values(
+    return read_values(
         session=session,
         in_memory=in_memory,
         output={object_name: ListingInfo},
datachain/lib/dc/pandas.py
CHANGED
@@ -5,7 +5,7 @@ from typing import (
 
 from datachain.query import Session
 
-from .values import from_values
+from .values import read_values
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_pandas(  # type: ignore[override]
+def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
     session: Optional[Session] = None,
@@ -32,7 +32,7 @@ def from_pandas(  # type: ignore[override]
         import datachain as dc
 
         df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
-        dc.from_pandas(df)
+        dc.read_pandas(df)
         ```
     """
     from .utils import DatasetPrepareError
@@ -46,7 +46,7 @@ def from_pandas(  # type: ignore[override]
             f"import from pandas error - '{column}' cannot be a column name",
         )
 
-    return from_values(
+    return read_values(
         name,
         session,
         settings=settings,
datachain/lib/dc/parquet.py
CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_parquet(
+def read_parquet(
     path,
     partitioning: Any = "hive",
     output: Optional[dict[str, DataType]] = None,
@@ -43,18 +43,18 @@ def from_parquet(
     Reading a single file:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/file.parquet")
+        dc.read_parquet("s3://mybucket/file.parquet")
         ```
 
         Reading a partitioned dataset from a directory:
         ```py
         import datachain as dc
-        dc.from_parquet("s3://mybucket/dir")
+        dc.read_parquet("s3://mybucket/dir")
         ```
     """
-    from .storage import from_storage
+    from .storage import read_storage
 
-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
         object_name=object_name,
datachain/lib/dc/records.py
CHANGED
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_records(
+def read_records(
     to_insert: Optional[Union[dict, list[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -40,10 +40,10 @@ def from_records(
     Example:
         ```py
         import datachain as dc
-        single_record = dc.from_records(dc.DEFAULT_FILE_RECORD)
+        single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
     """
-    from .datasets import from_dataset
+    from .datasets import read_dataset
 
     session = Session.get(session, in_memory=in_memory)
     catalog = session.catalog
@@ -87,4 +87,4 @@ def from_records(
     insert_q = dr.get_table().insert()
     for record in to_insert:
         db.execute(insert_q.values(**record))
-    return from_dataset(name=dsr.name, session=session, settings=settings)
+    return read_dataset(name=dsr.name, session=session, settings=settings)
datachain/lib/dc/storage.py
CHANGED
@@ -21,7 +21,7 @@ if TYPE_CHECKING:
     from .datachain import DataChain
 
 
-def from_storage(
+def read_storage(
     uri: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
     *,
     type: FileType = "binary",
@@ -55,12 +55,12 @@ def from_storage(
         Simple call from s3:
         ```python
         import datachain as dc
-        chain = dc.from_storage("s3://my-bucket/my-dir")
+        chain = dc.read_storage("s3://my-bucket/my-dir")
         ```
 
         Multiple URIs:
         ```python
-        chain = dc.from_storage([
+        chain = dc.read_storage([
             "s3://bucket1/dir1",
             "s3://bucket2/dir2"
         ])
@@ -68,7 +68,7 @@ def from_storage(
 
         With AWS S3-compatible storage:
         ```python
-        chain = dc.from_storage(
+        chain = dc.read_storage(
             "s3://my-bucket/my-dir",
             client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
         )
@@ -77,7 +77,7 @@ def from_storage(
         Pass existing session
         ```py
         session = Session.get()
-        chain = dc.from_storage([
+        chain = dc.read_storage([
             "path/to/dir1",
             "path/to/dir2"
         ], session=session, recursive=True)
@@ -88,9 +88,9 @@ def from_storage(
     avoiding redundant updates for URIs pointing to the same storage location.
     """
     from .datachain import DataChain
-    from .datasets import from_dataset
-    from .records import from_records
-    from .values import from_values
+    from .datasets import read_dataset
+    from .records import read_records
+    from .values import read_values
 
     file_type = get_file_type(type)
 
@@ -122,7 +122,8 @@ def from_storage(
             )
             continue
 
-        dc = from_dataset(list_ds_name, session=session, settings=settings)
+        dc = read_dataset(list_ds_name, session=session, settings=settings)
+        dc._query.update = update
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
 
        if update or not list_ds_exists:
@@ -130,7 +131,7 @@ def from_storage(
             def lst_fn(ds_name, lst_uri):
                 # disable prefetch for listing, as it pre-downloads all files
                 (
-                    from_records(
+                    read_records(
                         DataChain.DEFAULT_FILE_RECORD,
                         session=session,
                         settings=settings,
@@ -144,7 +145,7 @@ def from_storage(
                     .save(ds_name, listing=True)
                 )
 
-            dc._query.
+            dc._query.set_listing_fn(
                 lambda ds_name=list_ds_name, lst_uri=list_uri: lst_fn(ds_name, lst_uri)
             )
 
@@ -154,7 +155,7 @@ def from_storage(
         listed_ds_name.add(list_ds_name)
 
     if file_values:
-        file_chain = from_values(
+        file_chain = read_values(
             session=session,
             settings=settings,
             in_memory=in_memory,
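Beyond the rename, `read_storage` now defers listing: it stores the `update` flag on the underlying query and registers the listing step through `set_listing_fn` instead of running it eagerly, so a stale or missing listing is only (re)built when the chain executes. A hedged sketch of the user-facing effect (the paths are placeholders):

```py
import datachain as dc

# Uses the cached listing dataset if one exists and has not expired.
chain = dc.read_storage("s3://my-bucket/data/")

# update=True marks the query so the listing is refreshed at execution
# time rather than immediately at construction.
fresh = dc.read_storage("s3://my-bucket/data/", update=True)
fresh.exec()  # the registered listing function runs here, if needed
```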
datachain/lib/dc/values.py
CHANGED
@@ -6,7 +6,7 @@ from typing import (
 
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import dict_to_data_model
-from datachain.lib.dc.records import from_records
+from datachain.lib.dc.records import read_records
 from datachain.lib.dc.utils import OutputType
 from datachain.query import Session
 
@@ -18,7 +18,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")
 
 
-def from_values(
+def read_values(
     ds_name: str = "",
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
@@ -32,7 +32,7 @@ def from_values(
     Example:
         ```py
         import datachain as dc
-        dc.from_values(fib=[1, 2, 3, 5, 8])
+        dc.read_values(fib=[1, 2, 3, 5, 8])
         ```
     """
     from .datachain import DataChain
@@ -42,7 +42,7 @@ def from_values(
     def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
         yield from tuples
 
-    chain = from_records(
+    chain = read_records(
         DataChain.DEFAULT_FILE_RECORD,
         session=session,
         settings=settings,
datachain/lib/listing.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import posixpath
 from collections.abc import Iterator
 from contextlib import contextmanager
+from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 
 from fsspec.asyn import get_loop
@@ -32,6 +33,16 @@ logging.getLogger("aiobotocore.credentials").setLevel(logging.CRITICAL)
 logging.getLogger("gcsfs").setLevel(logging.CRITICAL)
 
 
+def listing_dataset_expired(lst_ds) -> bool:
+    """Function that checks if listing dataset is expired or not"""
+    lst_version = lst_ds.versions[-1]
+    if not lst_version.finished_at:
+        return False
+
+    expires = lst_version.finished_at + timedelta(seconds=LISTING_TTL)
+    return datetime.now(timezone.utc) > expires
+
+
 def list_bucket(uri: str, cache, client_config=None) -> Callable:
     """
     Function that returns another generator function that yields File objects
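The new `listing_dataset_expired` helper compares the latest listing version's `finished_at` timestamp against a `LISTING_TTL` window defined elsewhere in the module. A standalone sketch of the same check, with the TTL value and the version object stubbed out as assumptions:

```py
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

LISTING_TTL = 4 * 60 * 60  # assumed TTL in seconds; the real constant lives in datachain

@dataclass
class ListingVersion:  # hypothetical stand-in for a dataset version record
    finished_at: Optional[datetime]

def expired(version: ListingVersion) -> bool:
    # mirrors listing_dataset_expired: an unfinished listing never expires
    if not version.finished_at:
        return False
    expires = version.finished_at + timedelta(seconds=LISTING_TTL)
    return datetime.now(timezone.utc) > expires

print(expired(ListingVersion(datetime.now(timezone.utc) - timedelta(hours=5))))  # True
print(expired(ListingVersion(None)))  # False
```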
datachain/lib/meta_formats.py
CHANGED
@@ -103,10 +103,10 @@ def read_meta(  # noqa: C901
     model_name=None,
     nrows=None,
 ) -> Callable:
-    from datachain import from_storage
+    from datachain import read_storage
 
     if schema_from:
-        file = next(from_storage(schema_from, type="text").limit(1).collect("file"))
+        file = next(read_storage(schema_from, type="text").limit(1).collect("file"))
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )
datachain/lib/pytorch.py
CHANGED
@@ -14,7 +14,7 @@ from torchvision.transforms import v2
 from datachain import Session
 from datachain.cache import get_temp_cache
 from datachain.catalog import Catalog, get_catalog
-from datachain.lib.dc.datasets import from_dataset
+from datachain.lib.dc.datasets import read_dataset
 from datachain.lib.settings import Settings
 from datachain.lib.text import convert_text
 from datachain.progress import CombinedDownloadCallback
@@ -122,7 +122,7 @@ class PytorchDataset(IterableDataset):
     ) -> Generator[tuple[Any, ...], None, None]:
         catalog = self._get_catalog()
         session = Session("PyTorch", catalog=catalog)
-        ds = from_dataset(
+        ds = read_dataset(
             name=self.name, version=self.version, session=session
         ).settings(cache=self.cache, prefetch=self.prefetch)
         ds = ds.remove_file_signals()
datachain/lib/udf.py
CHANGED
datachain/query/dataset.py
CHANGED
@@ -47,7 +47,10 @@ from datachain.error import (
     QueryScriptCancelError,
 )
 from datachain.func.base import Function
-from datachain.lib.listing import is_listing_dataset
+from datachain.lib.listing import (
+    is_listing_dataset,
+    listing_dataset_expired,
+)
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
@@ -1080,6 +1083,7 @@ class DatasetQuery:
         indexing_column_types: Optional[dict[str, Any]] = None,
         in_memory: bool = False,
         fallback_to_studio: bool = True,
+        update: bool = False,
     ) -> None:
         from datachain.remote.studio import is_token_set
 
@@ -1097,6 +1101,8 @@ class DatasetQuery:
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
         self.before_steps: list[Callable] = []
+        self.listing_fn: Optional[Callable] = None
+        self.update = update
 
         self.list_ds_name: Optional[str] = None
 
@@ -1190,23 +1196,30 @@ class DatasetQuery:
             col.table = self.table
         return col
 
-    def 
-        """
-
-        """
-        self.before_steps.append(fn)
+    def set_listing_fn(self, fn: Callable) -> None:
+        """Setting listing function to be run if needed"""
+        self.listing_fn = fn
 
     def apply_steps(self) -> QueryGenerator:
         """
         Apply the steps in the query and return the resulting
         sqlalchemy.SelectBase.
         """
-
-
+        if self.list_ds_name and not self.starting_step:
+            listing_ds = None
+            try:
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
+            except DatasetNotFoundError:
+                pass
+
+            if not listing_ds or self.update or listing_dataset_expired(listing_ds):
+                assert self.listing_fn
+                self.listing_fn()
+                listing_ds = self.catalog.get_dataset(self.list_ds_name)
 
-        if self.list_ds_name:
             # at this point we know what is our starting listing dataset name
-            self._set_starting_step(
+            self._set_starting_step(listing_ds)  # type: ignore [arg-type]
+
         query = self.clone()
 
         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
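Taken together, `apply_steps` now resolves the starting listing dataset lazily: look it up, run the registered listing function only when the dataset is missing, `update` was requested, or the TTL has lapsed, then use the result as the starting step. A condensed sketch of that control flow, with the catalog and listing function stubbed out:

```py
from typing import Any, Callable, Optional

class DatasetNotFoundError(Exception):
    pass

def resolve_listing(
    get_dataset: Callable[[str], Any],         # stand-in for catalog.get_dataset
    list_ds_name: str,
    listing_fn: Optional[Callable[[], None]],  # registered via set_listing_fn
    update: bool,
    expired: Callable[[Any], bool],            # stand-in for listing_dataset_expired
) -> Any:
    listing_ds = None
    try:
        listing_ds = get_dataset(list_ds_name)
    except DatasetNotFoundError:
        pass

    if not listing_ds or update or expired(listing_ds):
        assert listing_fn  # a listing function must have been registered
        listing_fn()       # (re)creates the listing dataset
        listing_ds = get_dataset(list_ds_name)

    return listing_ds  # becomes the query's starting step
```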
datachain/toolkit/split.py
CHANGED
@@ -41,7 +41,7 @@ def train_test_split(
     from datachain.toolkit import train_test_split
 
     # Load a DataChain from a storage source (e.g., S3 bucket)
-    dc = dc.from_storage("s3://bucket/dir/")
+    dc = dc.read_storage("s3://bucket/dir/")
 
     # Perform a 70/30 train-test split
     train, test = train_test_split(dc, [0.7, 0.3])
{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.14.1
+Version: 0.14.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -38,7 +38,7 @@ Requires-Dist: sqlalchemy>=2
 Requires-Dist: multiprocess==0.70.16
 Requires-Dist: cloudpickle
 Requires-Dist: orjson>=3.10.5
-Requires-Dist: pydantic<
+Requires-Dist: pydantic<2.11,>=2
 Requires-Dist: jmespath>=1.0
 Requires-Dist: datamodel-code-generator>=0.25
 Requires-Dist: Pillow<12,>=10.0.0
@@ -171,8 +171,8 @@ high confidence scores.
 
 import datachain as dc
 
-meta = dc.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
-images = dc.from_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
+meta = dc.read_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta", anon=True)
+images = dc.read_storage("gs://datachain-demo/dogs-and-cats/*jpg", anon=True)
 
 images_id = images.map(id=lambda file: file.path.split('.')[-2])
 annotated = images_id.merge(meta, on="id", right_on="meta.id")
@@ -213,7 +213,7 @@ Python code:
     return result.lower().startswith("success")
 
 chain = (
-    dc.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
+    dc.read_storage("gs://datachain-demo/chatbot-KiT/", object_name="file", anon=True)
     .settings(parallel=4, cache=True)
     .map(is_success=eval_dialogue)
     .save("mistral_files")
{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=h3W0agyTcpXOfMA26jZyHo-Gs7vLXhbR-9uEkzK8Szk,1414
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
@@ -17,7 +17,7 @@ datachain/studio.py,sha256=9MEpFPLKI3gG4isKklcfD5BMLeNsSXhtOUboOjW4Fdc,10017
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=CLAYkI7iPbLYw3Pjh5EkWuc2UOs8wEbuXQnqIs4UyV8,14173
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=k-okQ4aqoyWrsNlDeCz6jP6TNRiZCUENbGV9Sz6EEtw,60729
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=AhSQR_-S-9lY3DcXn3PVZv9UtarHOMlDy2x75iDwUjo,6035
 datachain/cli/__init__.py,sha256=YPVkuQ7IezNhtzo5xrfca1hEIiZtFxOlJCOzAOEuxmA,8335
@@ -29,7 +29,7 @@ datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR
 datachain/cli/commands/ls.py,sha256=dSD2_MHng4t9HRFJZWMOCjPL4XU3qaBV3piNl8UXP08,5275
 datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
 datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
-datachain/cli/commands/show.py,sha256=
+datachain/cli/commands/show.py,sha256=P6e6bYiRCyVKO0ggnoFkLkwGmBWlrlm8W5c_sBNxBBw,1604
 datachain/cli/parser/__init__.py,sha256=rtjlqSsDd4LZH9WdgvluO27M4sID1wD7YkQ4cKhNXzw,15721
 datachain/cli/parser/job.py,sha256=kvQkSfieyUmvJpOK8p78UgS8sygHhQXztRlOtVcgtaU,3449
 datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
@@ -73,16 +73,16 @@ datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810
 datachain/lib/file.py,sha256=HLQXS_WULm7Y-fkHMy0WpibVAcrkLPRS6CrZy6rwFe0,30450
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
-datachain/lib/listing.py,sha256=
+datachain/lib/listing.py,sha256=O29s7H-2rqjHHGKWkKGNNXlo2zynv4pygVTKImpV8fo,7046
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A,6349
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
-datachain/lib/pytorch.py,sha256=
+datachain/lib/pytorch.py,sha256=YS6yR13iVlrAXo5wzJswFFUHwWOql9KTdWIa86DXB-k,7712
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
 datachain/lib/signal_schema.py,sha256=DRatqSG7OVtCUCWyZvMXe4m7r7XFO6NCfzsJRDErMtg,35185
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=h38a457xg-4wO2XcxPs4pzDq8JxTmYm4N84iAf0HRzY,16168
 datachain/lib/udf_signature.py,sha256=2EtsOPDNSPqcOlYwqbCdy6RF5MldI-7smii8aLy8p7Y,7543
 datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
 datachain/lib/video.py,sha256=suH_8Mi8VYk4-IVb1vjSduF_njs64ji1WGKHxDLnGYw,6629
@@ -94,19 +94,19 @@ datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djM
 datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
 datachain/lib/convert/values_to_tuples.py,sha256=EFfIGBiVVltJQG8blzsQ1dGXneh4D3wdLfSUeoK10OI,3931
-datachain/lib/dc/__init__.py,sha256=
-datachain/lib/dc/csv.py,sha256=
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
-datachain/lib/dc/hf.py,sha256=
-datachain/lib/dc/json.py,sha256=
-datachain/lib/dc/listings.py,sha256=
-datachain/lib/dc/pandas.py,sha256=
-datachain/lib/dc/parquet.py,sha256=
-datachain/lib/dc/records.py,sha256=
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/__init__.py,sha256=6rKKHS6MA3mS6UJXiysrv4TURs4R_UWAQK2tJ2t1QMs,743
+datachain/lib/dc/csv.py,sha256=d0ULzpsTTeqp_eM-2jVHb1kYHQN2lJFf4O6LWd5tOJw,4401
+datachain/lib/dc/datachain.py,sha256=hwuAElfEhRLyh-Uvuc7YIpFx6nsI_B90xwnMqgkkgrI,76390
+datachain/lib/dc/datasets.py,sha256=hTzq18Ij9kpOAJOU-VN4-VyThTTxLSWLfVIk3bgzAPs,4329
+datachain/lib/dc/hf.py,sha256=I1vFNOa1C87lBuBj5FHENLY2jTaQ8erngiX0cyBmOp4,2170
+datachain/lib/dc/json.py,sha256=9ei9ZNzWVXZWD4HNGTfBhcoLPnXBBDywKV-3Wi1mT28,2725
+datachain/lib/dc/listings.py,sha256=qPy1DTvYkbNICT1ujo8LwezzMEW8E3dln1knw7Jwl0I,1044
+datachain/lib/dc/pandas.py,sha256=jJvgNPPjiSLAjdYlhI4fvGKNWRh-hbMgZyBlURS633E,1249
+datachain/lib/dc/parquet.py,sha256=lXCSr_S7bQsPUWq1pJ-Ur8R8RxArjyFpCpBXK-aorQw,1809
+datachain/lib/dc/records.py,sha256=DOFkQV7A7kZnMiCS4mHOzee2ibWIhz-mWQpgVsU78SE,2524
+datachain/lib/dc/storage.py,sha256=kM3Ix2L0j01a4XcXPZpdDxvici9yu-Ks-Cd3uf_qESA,5327
 datachain/lib/dc/utils.py,sha256=Ct-0FqCaDhNWHx09gJFcCXJGPjMI-VZr4t-GJyqTi44,3984
-datachain/lib/dc/values.py,sha256=
+datachain/lib/dc/values.py,sha256=HaABQKmhgW-N1pcBn7CQuTIiOFXYVjU1H9LbupGM3WQ,1409
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
 datachain/model/bbox.py,sha256=cQNHuQuVsh6bW3n3Hj40F2Cc20cExQ9Lg_q7R2jxUMI,9324
 datachain/model/pose.py,sha256=rjquA6M-I-Y30Xm6YSkGv1OY52hJZmR2AuxbIpE5uD0,3865
@@ -118,7 +118,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
 datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=G_fyt3vwifY5Usnp8pvkho543innrcDOImKLqG3W3YU,58665
 datachain/query/dispatch.py,sha256=_1vjeQ1wjUoxlik55k0JkWqQCUfMjgVWmEOyWRkx0dU,12437
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -148,11 +148,11 @@ datachain/sql/sqlite/base.py,sha256=N-cQT0Hpu9ROWe4OiKlkkn_YP1NKCRZZ3xSfTzpyaDA,
 datachain/sql/sqlite/types.py,sha256=cH6oge2E_YWFy22wY-txPJH8gxoQFSpCthtZR8PZjpo,1849
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
-datachain/toolkit/split.py,sha256=
+datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.14.
-datachain-0.14.
-datachain-0.14.
-datachain-0.14.
-datachain-0.14.
-datachain-0.14.
+datachain-0.14.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.14.2.dist-info/METADATA,sha256=hvPp9rvpa2p9FnopnOrd4DvJE1Rugef5YHe8vViSPyI,11338
+datachain-0.14.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+datachain-0.14.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.14.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.14.2.dist-info/RECORD,,
{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/WHEEL
File without changes

{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/entry_points.txt
File without changes

{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/licenses/LICENSE
File without changes

{datachain-0.14.1.dist-info → datachain-0.14.2.dist-info}/top_level.txt
File without changes