datachain 0.14.1__py3-none-any.whl → 0.14.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the registry.
- datachain/__init__.py +18 -18
- datachain/catalog/catalog.py +5 -5
- datachain/catalog/loader.py +4 -9
- datachain/cli/commands/show.py +2 -2
- datachain/data_storage/warehouse.py +9 -0
- datachain/lib/dc/__init__.py +18 -18
- datachain/lib/dc/csv.py +5 -5
- datachain/lib/dc/datachain.py +42 -42
- datachain/lib/dc/datasets.py +7 -7
- datachain/lib/dc/hf.py +5 -5
- datachain/lib/dc/json.py +5 -5
- datachain/lib/dc/listings.py +2 -2
- datachain/lib/dc/pandas.py +4 -4
- datachain/lib/dc/parquet.py +5 -5
- datachain/lib/dc/records.py +4 -4
- datachain/lib/dc/storage.py +13 -12
- datachain/lib/dc/values.py +4 -4
- datachain/lib/listing.py +11 -0
- datachain/lib/meta_formats.py +2 -2
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +1 -1
- datachain/query/dataset.py +62 -50
- datachain/query/dispatch.py +6 -12
- datachain/query/udf.py +30 -1
- datachain/toolkit/split.py +1 -1
- datachain/utils.py +30 -4
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/METADATA +5 -5
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/RECORD +32 -32
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/WHEEL +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.1.dist-info → datachain-0.14.3.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED

@@ -5,16 +5,16 @@ from datachain.lib.dc import (
     DataChain,
     Sys,
     datasets,
-    from_csv,
-    from_dataset,
-    from_hf,
-    from_json,
-    from_pandas,
-    from_parquet,
-    from_records,
-    from_storage,
-    from_values,
     listings,
+    read_csv,
+    read_dataset,
+    read_hf,
+    read_json,
+    read_pandas,
+    read_parquet,
+    read_records,
+    read_storage,
+    read_values,
 )
 from datachain.lib.file import (
     ArrowRow,

@@ -61,17 +61,17 @@ __all__ = [
     "VideoFragment",
     "VideoFrame",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "is_chain_type",
     "listings",
     "metrics",
     "param",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
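Taken together, the `datachain/__init__.py` changes rename every public `from_*` entry point to `read_*`. A minimal before/after sketch of the migration (the bucket URI is illustrative):

```py
import datachain as dc

# 0.14.1:
# chain = dc.from_storage("s3://mybucket/images/")

# 0.14.3 — same behavior, new name:
chain = dc.read_storage("s3://mybucket/images/")
```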
datachain/catalog/catalog.py
CHANGED

@@ -583,10 +583,10 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain import from_storage
+        from datachain import read_storage
         from datachain.listing import Listing

-        from_storage(
+        read_storage(
             source, session=self.session, update=update, object_name=object_name
         ).exec()

@@ -994,14 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")

-        from datachain import from_dataset, from_storage
+        from datachain import read_dataset, read_storage

         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
+                dc = read_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc = from_storage(source, session=self.session, recursive=recursive)
+                dc = read_storage(source, session=self.session, recursive=recursive)

             chains.append(dc)
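The second `Catalog` hunk dispatches on a dataset URI prefix. A small sketch of the same dispatch outside `Catalog`; the `ds://` prefix value is an assumption, since this diff references the `DATASET_PREFIX` constant by name only:

```py
import datachain as dc

DATASET_PREFIX = "ds://"  # assumed value; the diff only shows the constant's name

def chain_for_source(source: str):
    # Dataset URIs are read as datasets; anything else as a storage location.
    if source.startswith(DATASET_PREFIX):
        return dc.read_dataset(source[len(DATASET_PREFIX) :])
    return dc.read_storage(source, recursive=True)
```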
datachain/catalog/loader.py
CHANGED

@@ -7,6 +7,7 @@ from datachain.utils import get_envs_by_prefix
 if TYPE_CHECKING:
     from datachain.catalog import Catalog
     from datachain.data_storage import AbstractMetastore, AbstractWarehouse
+    from datachain.query.udf import AbstractUDFDistributor

 METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
 METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"

@@ -15,7 +16,6 @@ WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
 WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"
-DISTRIBUTED_ARG_PREFIX = "DATACHAIN_DISTRIBUTED_ARG_"

 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"

@@ -100,27 +100,22 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
     return warehouse_class(**warehouse_args)


-def get_distributed_class(**kwargs):
+def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
     distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
-    distributed_arg_envs = get_envs_by_prefix(DISTRIBUTED_ARG_PREFIX)
-    # Convert env variable names to keyword argument names by lowercasing them
-    distributed_args = {k.lower(): v for k, v in distributed_arg_envs.items()}

     if not distributed_import_path:
         raise RuntimeError(
             f"{DISTRIBUTED_IMPORT_PATH} import path is required "
             "for distributed UDF processing."
         )
-    # Distributed class paths are specified as (for example):
-    # module.classname
+    # Distributed class paths are specified as (for example): module.classname
     if "." not in distributed_import_path:
         raise RuntimeError(
             f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
         )
     module_name, _, class_name = distributed_import_path.rpartition(".")
     distributed = import_module(module_name)
-    distributed_class = getattr(distributed, class_name)
-    return distributed_class(**distributed_args | kwargs)
+    return getattr(distributed, class_name)


 def get_catalog(
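The loader now returns the distributor class itself instead of constructing an instance from `DATACHAIN_DISTRIBUTED_ARG_*` environment variables, presumably leaving instantiation to the caller. A standalone sketch of the same `module.ClassName` resolution (the import path below is hypothetical):

```py
import os
from importlib import import_module

def resolve_class(import_path: str) -> type:
    # Same strategy as get_udf_distributor_class(): split "module.ClassName"
    # on the last dot, import the module, and return the class attribute.
    if "." not in import_path:
        raise RuntimeError(f"Invalid import path: {import_path}")
    module_name, _, class_name = import_path.rpartition(".")
    return getattr(import_module(module_name), class_name)

# Hypothetical distributor implementation living in my_pkg/distributor.py:
# os.environ["DATACHAIN_DISTRIBUTED"] = "my_pkg.distributor.MyUDFDistributor"
# distributor_cls = resolve_class(os.environ["DATACHAIN_DISTRIBUTED"])
```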
datachain/cli/commands/show.py
CHANGED

@@ -18,7 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session, from_dataset
+    from datachain import Session, read_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records

@@ -51,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc = from_dataset(name=name, version=version, session=session)
+        dc = read_dataset(name=name, version=version, session=session)
         dc.print_schema()
datachain/data_storage/warehouse.py
CHANGED

@@ -199,6 +199,15 @@ class AbstractWarehouse(ABC, Serializable):
     # Query Execution
     #

+    def query_count(self, query: sa.sql.selectable.Select) -> int:
+        """Count the number of rows in a query."""
+        count_query = sa.select(func.count(1)).select_from(query.subquery())
+        return next(self.db.execute(count_query))[0]
+
+    def table_rows_count(self, table) -> int:
+        count_query = sa.select(func.count(1)).select_from(table)
+        return next(self.db.execute(count_query))[0]
+
     def dataset_select_paginated(
         self,
         query,
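Both new warehouse helpers share one pattern: wrap the target in a `SELECT count(1)`. A self-contained SQLAlchemy sketch of that pattern against an in-memory SQLite table (the table and data are illustrative):

```py
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
meta = sa.MetaData()
items = sa.Table("items", meta, sa.Column("id", sa.Integer, primary_key=True))
meta.create_all(engine)

with engine.connect() as conn:
    conn.execute(sa.insert(items), [{"id": 1}, {"id": 2}, {"id": 3}])

    # query_count: count the rows of an arbitrary SELECT via a subquery
    query = sa.select(items).where(items.c.id > 1)
    count_query = sa.select(sa.func.count(1)).select_from(query.subquery())
    print(conn.execute(count_query).scalar())  # 2

    # table_rows_count: count the rows of a table directly
    print(conn.execute(sa.select(sa.func.count(1)).select_from(items)).scalar())  # 3
```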
datachain/lib/dc/__init__.py
CHANGED

@@ -1,15 +1,15 @@
-from .csv import from_csv
+from .csv import read_csv
 from .datachain import C, Column, DataChain
-from .datasets import datasets, from_dataset
-from .hf import from_hf
-from .json import from_json
+from .datasets import datasets, read_dataset
+from .hf import read_hf
+from .json import read_json
 from .listings import listings
-from .pandas import from_pandas
-from .parquet import from_parquet
-from .records import from_records
-from .storage import from_storage
+from .pandas import read_pandas
+from .parquet import read_parquet
+from .records import read_records
+from .storage import read_storage
 from .utils import DatasetMergeError, DatasetPrepareError, Sys
-from .values import from_values
+from .values import read_values

 __all__ = [
     "C",

@@ -19,14 +19,14 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
-    "from_csv",
-    "from_dataset",
-    "from_hf",
-    "from_json",
-    "from_pandas",
-    "from_parquet",
-    "from_records",
-    "from_storage",
-    "from_values",
     "listings",
+    "read_csv",
+    "read_dataset",
+    "read_hf",
+    "read_json",
+    "read_pandas",
+    "read_parquet",
+    "read_records",
+    "read_storage",
+    "read_values",
 ]
datachain/lib/dc/csv.py
CHANGED

@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     from .datachain import DataChain


-def from_csv(
+def read_csv(
     path,
     delimiter: Optional[str] = None,
     header: bool = True,

@@ -58,13 +58,13 @@ def from_csv(
     Reading a csv file:
     ```py
     import datachain as dc
-    chain = dc.from_csv("s3://mybucket/file.csv")
+    chain = dc.read_csv("s3://mybucket/file.csv")
     ```

     Reading csv files from a directory as a combined dataset:
     ```py
     import datachain as dc
-    chain = dc.from_csv("s3://mybucket/dir")
+    chain = dc.read_csv("s3://mybucket/dir")
     ```
     """
     from pandas.io.parsers.readers import STR_NA_VALUES

@@ -72,7 +72,7 @@ def from_csv(
     from pyarrow.dataset import CsvFileFormat
     from pyarrow.lib import type_for_alias

-    from .storage import from_storage
+    from .storage import read_storage

     parse_options = parse_options or {}
     if "delimiter" not in parse_options:

@@ -88,7 +88,7 @@ def from_csv(
     else:
         column_types = {}

-    chain = from_storage(path, session=session, settings=settings, **kwargs)
+    chain = read_storage(path, session=session, settings=settings, **kwargs)

     column_names = None
     if not header:
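As the hunks above show, `read_csv` folds `delimiter` into pyarrow's parse options and delegates file listing to `read_storage`, so it accepts the same storage URIs. A short sketch with an explicit delimiter (the path is illustrative):

```py
import datachain as dc

# Tab-separated file; header=True is the default per the signature above.
chain = dc.read_csv("s3://mybucket/data.tsv", delimiter="\t", header=True)
```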
datachain/lib/dc/datachain.py
CHANGED

@@ -84,22 +84,22 @@ class DataChain:
     underlyind library `Pydantic`.

     See Also:
-        `from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        `read_storage("s3://my-bucket/my-dir/")` - reading unstructured
             data files from storages such as S3, gs or Azure ADLS.

         `DataChain.save("name")` - saving to a dataset.

-        `from_dataset("name")` - reading from a dataset.
+        `read_dataset("name")` - reading from a dataset.

-        `from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
+        `read_values(fib=[1, 2, 3, 5, 8])` - generating from values.

-        `from_pandas(pd.DataFrame(...))` - generating from pandas.
+        `read_pandas(pd.DataFrame(...))` - generating from pandas.

-        `from_json("file.json")` - generating from json.
+        `read_json("file.json")` - generating from json.

-        `from_csv("file.csv")` - generating from csv.
+        `read_csv("file.csv")` - generating from csv.

-        `from_parquet("file.parquet")` - generating from parquet.
+        `read_parquet("file.parquet")` - generating from parquet.

     Example:
     ```py

@@ -118,7 +118,7 @@ class DataChain:
         api_key = os.environ["MISTRAL_API_KEY"]

         chain = (
-            dc.from_storage("gs://datachain-demo/chatbot-KiT/")
+            dc.read_storage("gs://datachain-demo/chatbot-KiT/")
             .limit(5)
             .settings(cache=True, parallel=5)
             .map(

@@ -315,27 +315,27 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .storage import from_storage
+        from .storage import read_storage

         warnings.warn(
             "Class method `from_storage` is deprecated. "
-            "Use `from_storage` function instead from top_module.",
+            "Use `read_storage` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_storage(*args, **kwargs)
+        return read_storage(*args, **kwargs)

     @classmethod
     def from_dataset(cls, *args, **kwargs) -> "DataChain":
-        from .datasets import from_dataset
+        from .datasets import read_dataset

         warnings.warn(
             "Class method `from_dataset` is deprecated. "
-            "Use `from_dataset` function instead from top_module.",
+            "Use `read_dataset` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_dataset(*args, **kwargs)
+        return read_dataset(*args, **kwargs)

     @classmethod
     def from_json(

@@ -343,15 +343,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .json import from_json
+        from .json import read_json

         warnings.warn(
             "Class method `from_json` is deprecated. "
-            "Use `from_json` function instead from top_module.",
+            "Use `read_json` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_json(*args, **kwargs)
+        return read_json(*args, **kwargs)

     def explode(
         self,

@@ -487,7 +487,7 @@ class DataChain:
         )

         chain = (
-            dc.from_storage("s3://my-bucket")
+            dc.read_storage("s3://my-bucket")
             .apply(parse_stem)
             .filter(C("stem").glob("*cat*"))
         )

@@ -727,7 +727,7 @@ class DataChain:

         Note:
             Order is not guaranteed when steps are added after an `order_by` statement.
-            I.e. when using `from_dataset` an `order_by` statement should be used if
+            I.e. when using `read_dataset` an `order_by` statement should be used if
             the order of the records in the chain is important.
             Using `order_by` directly before `limit`, `collect` and `collect_flatten`
             will give expected results.

@@ -1466,15 +1466,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .values import from_values
+        from .values import read_values

         warnings.warn(
             "Class method `from_values` is deprecated. "
-            "Use `from_values` function instead from top_module.",
+            "Use `read_values` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_values(*args, **kwargs)
+        return read_values(*args, **kwargs)

     @classmethod
     def from_pandas(

@@ -1482,15 +1482,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .pandas import from_pandas
+        from .pandas import read_pandas

         warnings.warn(
             "Class method `from_pandas` is deprecated. "
-            "Use `from_pandas` function instead from top_module.",
+            "Use `read_pandas` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_pandas(*args, **kwargs)
+        return read_pandas(*args, **kwargs)

     def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.

@@ -1575,15 +1575,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .hf import from_hf
+        from .hf import read_hf

         warnings.warn(
             "Class method `from_hf` is deprecated. "
-            "Use `from_hf` function instead from top_module.",
+            "Use `read_hf` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_hf(*args, **kwargs)
+        return read_hf(*args, **kwargs)

     def parse_tabular(
         self,

@@ -1610,7 +1610,7 @@ class DataChain:
         Reading a json lines file:
         ```py
         import datachain as dc
-        chain = dc.from_storage("s3://mybucket/file.jsonl")
+        chain = dc.read_storage("s3://mybucket/file.jsonl")
         chain = chain.parse_tabular(format="json")
         ```

@@ -1618,7 +1618,7 @@ class DataChain:
         ```py
         import datachain as dc

-        chain = dc.from_storage("s3://mybucket")
+        chain = dc.read_storage("s3://mybucket")
         chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
         chain = chain.parse_tabular(format="json")
         ```

@@ -1680,15 +1680,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .csv import from_csv
+        from .csv import read_csv

         warnings.warn(
             "Class method `from_csv` is deprecated. "
-            "Use `from_csv` function instead from top_module.",
+            "Use `read_csv` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_csv(*args, **kwargs)
+        return read_csv(*args, **kwargs)

     @classmethod
     def from_parquet(

@@ -1696,15 +1696,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .parquet import from_parquet
+        from .parquet import read_parquet

         warnings.warn(
             "Class method `from_parquet` is deprecated. "
-            "Use `from_parquet` function instead from top_module.",
+            "Use `read_parquet` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_parquet(*args, **kwargs)
+        return read_parquet(*args, **kwargs)

     def to_parquet(
         self,

@@ -1930,15 +1930,15 @@ class DataChain:
         *args,
         **kwargs,
     ) -> "DataChain":
-        from .records import from_records
+        from .records import read_records

         warnings.warn(
             "Class method `from_records` is deprecated. "
-            "Use `from_records` function instead from top_module.",
+            "Use `read_records` function instead from top_module.",
             DeprecationWarning,
             stacklevel=2,
         )
-        return from_records(*args, **kwargs)
+        return read_records(*args, **kwargs)

     def sum(self, fr: DataType):  # type: ignore[override]
         """Compute the sum of a column."""

@@ -1969,7 +1969,7 @@ class DataChain:
         import datachain as dc

         (
-            dc.from_storage(DATA, type="text")
+            dc.read_storage(DATA, type="text")
             .settings(parallel=4, cache=True)
             .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
             .map(

@@ -2021,7 +2021,7 @@ class DataChain:
         ```py
         import datachain as dc

-        ds = dc.from_storage("s3://mybucket")
+        ds = dc.read_storage("s3://mybucket")
         ds.to_storage("gs://mybucket", placement="filename")
         ```
         """

@@ -2139,7 +2139,7 @@ class DataChain:
         ```py
         import datachain as dc

-        chain = dc.from_storage(...)
+        chain = dc.read_storage(...)
         chunk_1 = query._chunk(0, 2)
         chunk_2 = query._chunk(1, 2)
         ```
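All of the `DataChain.from_*` classmethods survive as thin shims that warn and delegate, so existing code keeps running during migration. A sketch of what callers see (the values are illustrative):

```py
import warnings
import datachain as dc

# Old spelling still works but emits a DeprecationWarning:
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    chain = dc.DataChain.from_values(fib=[1, 2, 3, 5, 8])
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# New spelling, no warning:
chain = dc.read_values(fib=[1, 2, 3, 5, 8])
```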
datachain/lib/dc/datasets.py
CHANGED

@@ -13,7 +13,7 @@ from datachain.query import Session
 from datachain.query.dataset import DatasetQuery

 from .utils import Sys
-from .values import from_values
+from .values import read_values

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec

@@ -23,7 +23,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")


-def from_dataset(
+def read_dataset(
     name: str,
     version: Optional[int] = None,
     session: Optional[Session] = None,

@@ -44,15 +44,15 @@ def from_dataset(
     Example:
     ```py
     import datachain as dc
-    chain = dc.from_dataset("my_cats")
+    chain = dc.read_dataset("my_cats")
     ```

     ```py
-    chain = dc.from_dataset("my_cats", fallback_to_studio=False)
+    chain = dc.read_dataset("my_cats", fallback_to_studio=False)
     ```

     ```py
-    chain = dc.from_dataset("my_cats", version=1)
+    chain = dc.read_dataset("my_cats", version=1)
     ```

     ```py

@@ -64,7 +64,7 @@ def from_dataset(
         "min_task_size": 1000,
         "prefetch": 10,
     }
-    chain = dc.from_dataset(
+    chain = dc.read_dataset(
         name="my_cats",
         version=1,
         session=session,

@@ -140,7 +140,7 @@ def datasets(
         )
     ]

-    return from_values(
+    return read_values(
         session=session,
         settings=settings,
         in_memory=in_memory,
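`datasets()` itself is built on `read_values`, as the last hunk shows, so it returns an ordinary chain of dataset records. A hedged usage sketch; the default object name `"dataset"` and the record fields are assumptions not shown in this hunk, and `collect()` is used as the iteration API of this release:

```py
import datachain as dc

# Iterate over known datasets; "dataset" is the assumed default object name.
for ds in dc.datasets().collect("dataset"):
    print(ds.name, ds.version)
```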
datachain/lib/dc/hf.py
CHANGED

@@ -18,7 +18,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")


-def from_hf(
+def read_hf(
     dataset: Union[str, "HFDatasetType"],
     *args,
     session: Optional[Session] = None,

@@ -42,7 +42,7 @@ def from_hf(
     Load from Hugging Face Hub:
     ```py
     import datachain as dc
-    chain = dc.from_hf("beans", split="train")
+    chain = dc.read_hf("beans", split="train")
     ```

     Generate chain from loaded dataset:

@@ -50,12 +50,12 @@ def from_hf(
     from datasets import load_dataset
     ds = load_dataset("beans", split="train")
     import datachain as dc
-    chain = dc.from_hf(ds)
+    chain = dc.read_hf(ds)
     ```
     """
     from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits

-    from .values import from_values
+    from .values import read_values

     output: dict[str, DataType] = {}
     ds_dict = stream_splits(dataset, *args, **kwargs)

@@ -69,5 +69,5 @@ def from_hf(
     if object_name:
         output = {object_name: model}

-    chain = from_values(split=list(ds_dict.keys()), session=session, settings=settings)
+    chain = read_values(split=list(ds_dict.keys()), session=session, settings=settings)
     return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
datachain/lib/dc/json.py
CHANGED

@@ -22,7 +22,7 @@ if TYPE_CHECKING:
     P = ParamSpec("P")


-def from_json(
+def read_json(
     path: Union[str, os.PathLike[str]],
     type: FileType = "text",
     spec: Optional[DataType] = None,

@@ -52,16 +52,16 @@ def from_json(
     infer JSON schema from data, reduce using JMESPATH
     ```py
     import datachain as dc
-    chain = dc.from_json("gs://json", jmespath="key1.key2")
+    chain = dc.read_json("gs://json", jmespath="key1.key2")
     ```

     infer JSON schema from a particular path
     ```py
     import datachain as dc
-    chain = dc.from_json("gs://json_ds", schema_from="gs://json/my.json")
+    chain = dc.read_json("gs://json_ds", schema_from="gs://json/my.json")
     ```
     """
-    from .storage import from_storage
+    from .storage import read_storage

     if schema_from == "auto":
         schema_from = os.fspath(path)

@@ -74,7 +74,7 @@ def from_json(
     object_name = jmespath_to_name(jmespath)
     if not object_name:
         object_name = format
-    chain = from_storage(uri=path, type=type, **kwargs)
+    chain = read_storage(uri=path, type=type, **kwargs)
     signal_dict = {
         object_name: read_meta(
             schema_from=schema_from,
datachain/lib/dc/listings.py
CHANGED

@@ -6,7 +6,7 @@ from typing import (
 from datachain.lib.listing_info import ListingInfo
 from datachain.query import Session

-from .values import from_values
+from .values import read_values

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec

@@ -35,7 +35,7 @@ def listings(
     session = Session.get(session, in_memory=in_memory)
     catalog = kwargs.get("catalog") or session.catalog

-    return from_values(
+    return read_values(
         session=session,
         in_memory=in_memory,
         output={object_name: ListingInfo},
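`listings()` follows the same `read_values` pattern, producing `ListingInfo` rows. A hedged sketch; the `"listing"` object name and the `uri` field are assumptions based on `output={object_name: ListingInfo}` above:

```py
import datachain as dc

# Enumerate cached storage listings; object and field names assumed.
for info in dc.listings().collect("listing"):
    print(info.uri)
```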