datachain 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- datachain/__init__.py +28 -1
- datachain/catalog/catalog.py +18 -9
- datachain/cli/commands/ls.py +2 -2
- datachain/cli/commands/show.py +2 -3
- datachain/diff/__init__.py +8 -5
- datachain/lib/dc/__init__.py +32 -0
- datachain/lib/dc/csv.py +127 -0
- datachain/lib/{dc.py → dc/datachain.py} +144 -733
- datachain/lib/dc/datasets.py +149 -0
- datachain/lib/dc/hf.py +73 -0
- datachain/lib/dc/json.py +91 -0
- datachain/lib/dc/listings.py +43 -0
- datachain/lib/dc/pandas.py +56 -0
- datachain/lib/dc/parquet.py +65 -0
- datachain/lib/dc/records.py +90 -0
- datachain/lib/dc/storage.py +118 -0
- datachain/lib/dc/utils.py +128 -0
- datachain/lib/dc/values.py +53 -0
- datachain/lib/meta_formats.py +2 -4
- datachain/lib/pytorch.py +2 -2
- datachain/lib/udf.py +3 -3
- datachain/toolkit/split.py +2 -2
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/METADATA +12 -11
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/RECORD +28 -16
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/WHEEL +1 -1
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info/licenses}/LICENSE +0 -0
- {datachain-0.13.0.dist-info → datachain-0.14.0.dist-info}/top_level.txt +0 -0
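The substantive change in this release is a refactor: the monolithic `datachain/lib/dc.py` becomes the `datachain/lib/dc/` subpackage (one module per source: `csv.py`, `json.py`, `parquet.py`, `storage.py`, and so on), and each `DataChain.from_*` classmethod constructor is deprecated in favor of a module-level function. All hunks below come from the renamed `datachain/lib/{dc.py → dc/datachain.py}`. A minimal migration sketch, assuming the new functions are re-exported at the package top level as the updated docstring examples (`import datachain as dc`) indicate:

```python
import datachain as dc

# 0.13.0 style: classmethod constructor; in 0.14.0 this emits DeprecationWarning.
chain = dc.DataChain.from_storage("s3://my-bucket/my-dir/")

# 0.14.0 style: module-level constructor.
chain = dc.from_storage("s3://my-bucket/my-dir/")
```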
datachain/lib/{dc.py → dc/datachain.py}
@@ -1,10 +1,9 @@
 import copy
 import os
 import os.path
-import re
 import sys
+import warnings
 from collections.abc import Iterator, Sequence
-from functools import wraps
 from typing import (
     IO,
     TYPE_CHECKING,
@@ -22,7 +21,6 @@ from typing import (
 import orjson
 import sqlalchemy
 from pydantic import BaseModel
-from sqlalchemy.sql.functions import GenericFunction
 from tqdm import tqdm
 
 from datachain.dataset import DatasetRecord
@@ -30,22 +28,13 @@ from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
 from datachain.lib.convert.python_to_sql import python_to_sql
-from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
-from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     EXPORT_FILES_MAX_THREADS,
     ArrowRow,
-    File,
     FileExporter,
-    FileType,
-    get_file_type,
 )
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
-from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta
-from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -57,124 +46,29 @@ from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
-
-
-
-
-
-
-
-
+from .utils import (
+    DatasetMergeError,
+    DatasetPrepareError,
+    MergeColType,
+    OutputType,
+    Sys,
+    _get_merge_error_str,
+    _validate_merge_on,
+    resolve_columns,
+)
 
 C = Column
 
 _T = TypeVar("_T")
-D = TypeVar("D", bound="DataChain")
 UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
 
 DEFAULT_PARQUET_CHUNK_SIZE = 100_000
 
+if TYPE_CHECKING:
+    import pandas as pd
+    from typing_extensions import ParamSpec, Self
 
-def resolve_columns(
-    method: "Callable[Concatenate[D, P], D]",
-) -> "Callable[Concatenate[D, P], D]":
-    """Decorator that resolvs input column names to their actual DB names. This is
-    specially important for nested columns as user works with them by using dot
-    notation e.g (file.name) but are actually defined with default delimiter
-    in DB, e.g file__name.
-    If there are any sql functions in arguments, they will just be transferred as is
-    to a method.
-    """
-
-    @wraps(method)
-    def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
-        resolved_args = self.signals_schema.resolve(
-            *[arg for arg in args if not isinstance(arg, GenericFunction)]  # type: ignore[arg-type]
-        ).db_signals()
-
-        for idx, arg in enumerate(args):
-            if isinstance(arg, GenericFunction):
-                resolved_args.insert(idx, arg)  # type: ignore[arg-type]
-
-        return method(self, *resolved_args, **kwargs)
-
-    return _inner
-
-
-class DatasetPrepareError(DataChainParamsError):  # noqa: D101
-    def __init__(self, name, msg, output=None):  # noqa: D107
-        name = f" '{name}'" if name else ""
-        output = f" output '{output}'" if output else ""
-        super().__init__(f"Dataset{name}{output} processing prepare error: {msg}")
-
-
-class DatasetFromValuesError(DataChainParamsError):  # noqa: D101
-    def __init__(self, name, msg):  # noqa: D107
-        name = f" '{name}'" if name else ""
-        super().__init__(f"Dataset{name} from values error: {msg}")
-
-
-MergeColType = Union[str, Function, sqlalchemy.ColumnElement]
-
-
-def _validate_merge_on(
-    on: Union[MergeColType, Sequence[MergeColType]],
-    ds: "DataChain",
-) -> Sequence[MergeColType]:
-    if isinstance(on, (str, sqlalchemy.ColumnElement)):
-        return [on]
-    if isinstance(on, Function):
-        return [on.get_column(table=ds._query.table)]
-    if isinstance(on, Sequence):
-        return [
-            c.get_column(table=ds._query.table) if isinstance(c, Function) else c
-            for c in on
-        ]
-
-
-def _get_merge_error_str(col: MergeColType) -> str:
-    if isinstance(col, str):
-        return col
-    if isinstance(col, Function):
-        return f"{col.name}()"
-    if isinstance(col, sqlalchemy.Column):
-        return col.name.replace(DEFAULT_DELIMITER, ".")
-    if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
-        return f"{col.name} expression"
-    return str(col)
-
-
-class DatasetMergeError(DataChainParamsError):  # noqa: D101
-    def __init__(  # noqa: D107
-        self,
-        on: Union[MergeColType, Sequence[MergeColType]],
-        right_on: Optional[Union[MergeColType, Sequence[MergeColType]]],
-        msg: str,
-    ):
-        def _get_str(
-            on: Union[MergeColType, Sequence[MergeColType]],
-        ) -> str:
-            if not isinstance(on, Sequence):
-                return str(on)  # type: ignore[unreachable]
-            return ", ".join([_get_merge_error_str(col) for col in on])
-
-        on_str = _get_str(on)
-        right_on_str = (
-            ", right_on='" + _get_str(right_on) + "'"
-            if right_on and isinstance(right_on, Sequence)
-            else ""
-        )
-        super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
-
-
-OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
-
-
-class Sys(DataModel):
-    """Model for internal DataChain signals `id` and `rand`."""
-
-    id: int
-    rand: int
+    P = ParamSpec("P")
 
 
 class DataChain:
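The helpers deleted in the hunk above (`resolve_columns`, the merge validators, the error classes, `MergeColType`, `OutputType`, `Sys`) move into the new `datachain/lib/dc/utils.py` and are re-imported here. The `resolve_columns` decorator exists because user-facing signals use dot notation while DB columns are flattened with `DEFAULT_DELIMITER`; a minimal sketch of that mapping (the helper name `to_db_name` is hypothetical, and `"__"` is the assumed value of `DEFAULT_DELIMITER`, inferred from the `file__name` example in the deleted docstring):

```python
DEFAULT_DELIMITER = "__"  # assumed; the real code imports it from datachain.query.schema

def to_db_name(signal: str) -> str:
    # User-facing "file.name" becomes the flattened DB column "file__name";
    # _get_merge_error_str above performs the reverse replace for error messages.
    return signal.replace(".", DEFAULT_DELIMITER)

assert to_db_name("file.name") == "file__name"
```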
@@ -190,22 +84,22 @@ class DataChain:
     underlyind library `Pydantic`.
 
     See Also:
-        `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        `from_storage("s3://my-bucket/my-dir/")` - reading unstructured
             data files from storages such as S3, gs or Azure ADLS.
 
         `DataChain.save("name")` - saving to a dataset.
 
-        `DataChain.from_dataset("name")` - reading from a dataset.
+        `from_dataset("name")` - reading from a dataset.
 
-        `DataChain.from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
+        `from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
 
-        `DataChain.from_pandas(pd.DataFrame(...))` - generating from pandas.
+        `from_pandas(pd.DataFrame(...))` - generating from pandas.
 
-        `DataChain.from_json("file.json")` - generating from json.
+        `from_json("file.json")` - generating from json.
 
-        `DataChain.from_csv("file.csv")` - generating from csv.
+        `from_csv("file.csv")` - generating from csv.
 
-        `DataChain.from_parquet("file.parquet")` - generating from parquet.
+        `from_parquet("file.parquet")` - generating from parquet.
 
     Example:
         ```py
@@ -213,8 +107,7 @@ class DataChain:
 
         from mistralai.client import MistralClient
         from mistralai.models.chat_completion import ChatMessage
-
-        from datachain import DataChain, Column
+        import datachain as dc
 
         PROMPT = (
             "Was this bot dialog successful? "
@@ -225,7 +118,7 @@ class DataChain:
         api_key = os.environ["MISTRAL_API_KEY"]
 
         chain = (
-            DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
+            dc.from_storage("gs://datachain-demo/chatbot-KiT/")
             .limit(5)
             .settings(cache=True, parallel=5)
             .map(
@@ -408,246 +301,57 @@ class DataChain:
         self._settings = settings if settings else Settings()
         return self
 
-    def reset_schema(self, signals_schema: SignalSchema) -> "Self":
+    def reset_schema(self, signals_schema: SignalSchema) -> "Self":
         self.signals_schema = signals_schema
         return self
 
-    def add_schema(self, signals_schema: SignalSchema) -> "Self":
+    def add_schema(self, signals_schema: SignalSchema) -> "Self":
         self.signals_schema |= signals_schema
         return self
 
     @classmethod
     def from_storage(
         cls,
-
-
-
-
-        settings: Optional[dict] = None,
-        in_memory: bool = False,
-        recursive: Optional[bool] = True,
-        object_name: str = "file",
-        update: bool = False,
-        anon: bool = False,
-        client_config: Optional[dict] = None,
-    ) -> "Self":
-        """Get data from a storage as a list of file with all file attributes.
-        It returns the chain itself as usual.
-
-        Parameters:
-            uri : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            recursive : search recursively for the given path.
-            object_name : Created object column name.
-            update : force storage reindexing. Default is False.
-            anon : If True, we will treat cloud bucket as public one
-            client_config : Optional client configuration for the storage client.
-
-        Example:
-            Simple call from s3
-            ```py
-            chain = DataChain.from_storage("s3://my-bucket/my-dir")
-            ```
-
-            With AWS S3-compatible storage
-            ```py
-            chain = DataChain.from_storage(
-                "s3://my-bucket/my-dir",
-                client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
-            )
-            ```
-
-            Pass existing session
-            ```py
-            session = Session.get()
-            chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
-            ```
-        """
-        file_type = get_file_type(type)
-
-        if anon:
-            client_config = (client_config or {}) | {"anon": True}
-        session = Session.get(session, client_config=client_config, in_memory=in_memory)
-        cache = session.catalog.cache
-        client_config = session.catalog.client_config
+        *args,
+        **kwargs,
+    ) -> "DataChain":
+        from .storage import from_storage
 
-
-
+        warnings.warn(
+            "Class method `from_storage` is deprecated. "
+            "Use `from_storage` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
-
-        # ds_name is None if object is a file, we don't want to use cache
-        # or do listing in that case - just read that single object
-        if not list_ds_name:
-            dc = cls.from_values(
-                session=session,
-                settings=settings,
-                in_memory=in_memory,
-                file=[get_file_info(list_uri, cache, client_config=client_config)],
-            )
-            dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-            return dc
-
-        if update or not list_ds_exists:
-            # disable prefetch for listing, as it pre-downloads all files
-            (
-                cls.from_records(
-                    DataChain.DEFAULT_FILE_RECORD,
-                    session=session,
-                    settings=settings,
-                    in_memory=in_memory,
-                )
-                .settings(prefetch=0)
-                .gen(
-                    list_bucket(list_uri, cache, client_config=client_config),
-                    output={f"{object_name}": File},
-                )
-                .save(list_ds_name, listing=True)
-            )
-
-        dc = cls.from_dataset(list_ds_name, session=session, settings=settings)
-        dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-
-        return ls(dc, list_path, recursive=recursive, object_name=object_name)
+        return from_storage(*args, **kwargs)
 
     @classmethod
-    def from_dataset(
-
-
-
-
-
-
-
-        """Get data from a saved Dataset. It returns the chain itself.
-        If dataset or version is not found locally, it will try to pull it from Studio.
-
-        Parameters:
-            name : dataset name
-            version : dataset version
-            session : Session to use for the chain.
-            settings : Settings to use for the chain.
-            fallback_to_studio : Try to pull dataset from Studio if not found locally.
-                Default is True.
-
-        Example:
-            ```py
-            chain = DataChain.from_dataset("my_cats")
-            ```
-
-            ```py
-            chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
-            ```
-
-            ```py
-            chain = DataChain.from_dataset("my_cats", version=1)
-            ```
-
-            ```py
-            session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
-            settings = {
-                "cache": True,
-                "parallel": 4,
-                "workers": 4,
-                "min_task_size": 1000,
-                "prefetch": 10,
-            }
-            chain = DataChain.from_dataset(
-                name="my_cats",
-                version=1,
-                session=session,
-                settings=settings,
-                fallback_to_studio=True,
-            )
-            ```
-        """
-        from datachain.telemetry import telemetry
-
-        query = DatasetQuery(
-            name=name,
-            version=version,
-            session=session,
-            indexing_column_types=File._datachain_column_types,
-            fallback_to_studio=fallback_to_studio,
+    def from_dataset(cls, *args, **kwargs) -> "DataChain":
+        from .datasets import from_dataset
+
+        warnings.warn(
+            "Class method `from_dataset` is deprecated. "
+            "Use `from_dataset` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
-
-        if settings:
-            _settings = Settings(**settings)
-        else:
-            _settings = Settings()
-
-        signals_schema = SignalSchema({"sys": Sys})
-        if query.feature_schema:
-            signals_schema |= SignalSchema.deserialize(query.feature_schema)
-        else:
-            signals_schema |= SignalSchema.from_column_types(query.column_types or {})
-        return cls(query, _settings, signals_schema)
+        return from_dataset(*args, **kwargs)
 
     @classmethod
     def from_json(
         cls,
-
-        type: FileType = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        format: Optional[str] = "json",
-        nrows=None,
+        *args,
         **kwargs,
     ) -> "DataChain":
-
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "text".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            format: "json", "jsonl"
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSON schema from data, reduce using JMESPATH
-            ```py
-            chain = DataChain.from_json("gs://json", jmespath="key1.key2")
-            ```
-
-            infer JSON schema from a particular path
-            ```py
-            chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = str(path)
+        from .json import from_json
 
-
-
-
-
-
-
-            object_name = format
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                format=format,
-                spec=spec,
-                model_name=model_name,
-                jmespath=jmespath,
-                nrows=nrows,
-            ),
-            "params": {"file": File},
-        }
-        # disable prefetch if nrows is set
-        settings = {"prefetch": 0} if nrows else {}
-        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
+        warnings.warn(
+            "Class method `from_json` is deprecated. "
+            "Use `from_json` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return from_json(*args, **kwargs)
 
     def explode(
         self,
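Every constructor below repeats the shim pattern introduced with `from_storage` above: lazily import the module-level replacement (deferred, presumably to avoid a circular import between `datachain.py` and its new sibling modules), warn, then delegate. Condensed from this diff:

```python
import warnings

class DataChain:
    @classmethod
    def from_storage(cls, *args, **kwargs) -> "DataChain":
        # Deferred import: the sibling module imports DataChain itself,
        # so importing it at module scope would be circular.
        from .storage import from_storage

        warnings.warn(
            "Class method `from_storage` is deprecated. "
            "Use `from_storage` function instead from top_module.",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the caller, not this shim
        )
        return from_storage(*args, **kwargs)
```

Note the `*args, **kwargs` signatures: the shims no longer advertise parameter names or defaults, so tooling that inspected the old classmethod signatures (autocomplete, `inspect.signature`) should target the new module-level functions instead.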
@@ -710,81 +414,34 @@ class DataChain:
     @classmethod
     def datasets(
         cls,
-
-
-        in_memory: bool = False,
-        object_name: str = "dataset",
-        include_listing: bool = False,
-        studio: bool = False,
+        *args,
+        **kwargs,
     ) -> "DataChain":
-
-
-        Args:
-            session: Optional session instance. If not provided, uses default session.
-            settings: Optional dictionary of settings to configure the chain.
-            in_memory: If True, creates an in-memory session. Defaults to False.
-            object_name: Name of the output object in the chain. Defaults to "dataset".
-            include_listing: If True, includes listing datasets. Defaults to False.
-            studio: If True, returns datasets from Studio only,
-                otherwise returns all local datasets. Defaults to False.
-
-        Returns:
-            DataChain: A new DataChain instance containing dataset information.
-
-        Example:
-            ```py
-            from datachain import DataChain
-
-            chain = DataChain.datasets()
-            for ds in chain.collect("dataset"):
-                print(f"{ds.name}@v{ds.version}")
-            ```
-        """
-        session = Session.get(session, in_memory=in_memory)
-        catalog = session.catalog
-
-        datasets = [
-            DatasetInfo.from_models(d, v, j)
-            for d, v, j in catalog.list_datasets_versions(
-                include_listing=include_listing, studio=studio
-            )
-        ]
+        from .datasets import datasets
 
-
-
-
-
-
-            **{object_name: datasets},  # type: ignore[arg-type]
+        warnings.warn(
+            "Class method `datasets` is deprecated. "
+            "Use `datasets` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
+        return datasets(*args, **kwargs)
 
     @classmethod
     def listings(
         cls,
-
-        in_memory: bool = False,
-        object_name: str = "listing",
+        *args,
         **kwargs,
     ) -> "DataChain":
-
-        Listing is a special kind of dataset which has directory listing data of
-        some underlying storage (e.g S3 bucket).
+        from .listings import listings
 
-
-
-
-
-
-        """
-        session = Session.get(session, in_memory=in_memory)
-        catalog = kwargs.get("catalog") or session.catalog
-
-        return cls.from_values(
-            session=session,
-            in_memory=in_memory,
-            output={object_name: ListingInfo},
-            **{object_name: catalog.listings()},  # type: ignore[arg-type]
+        warnings.warn(
+            "Class method `listings` is deprecated. "
+            "Use `listings` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
+        return listings(*args, **kwargs)
 
     def save(  # type: ignore[override]
         self,
@@ -822,6 +479,7 @@ class DataChain:
 
         Example:
             ```py
+            import datachain as dc
             def parse_stem(chain):
                 return chain.map(
                     lambda file: file.get_file_stem()
@@ -829,7 +487,7 @@ class DataChain:
                 )
 
             chain = (
-                DataChain.from_storage("s3://my-bucket")
+                dc.from_storage("s3://my-bucket")
                 .apply(parse_stem)
                 .filter(C("stem").glob("*cat*"))
             )
@@ -1358,7 +1016,7 @@ class DataChain:
     @overload
     def results(self, *, include_hidden: bool) -> list[tuple[Any, ...]]: ...
 
-    def results(self, *, row_factory=None, include_hidden=True):
+    def results(self, *, row_factory=None, include_hidden=True):
         if row_factory is None:
             return list(self.collect_flatten(include_hidden=include_hidden))
         return list(
@@ -1468,7 +1126,7 @@ class DataChain:
             remove_prefetched=remove_prefetched,
         )
 
-    def remove_file_signals(self) -> "Self":
+    def remove_file_signals(self) -> "Self":
         schema = self.signals_schema.clone_without_file_signals()
         return self.select(*schema.values.keys())
 
@@ -1805,73 +1463,34 @@ class DataChain:
     @classmethod
     def from_values(
         cls,
-
-
-
-
-        output: OutputType = None,
-        object_name: str = "",
-        **fr_map,
-    ) -> "Self":
-        """Generate chain from list of values.
-
-        Example:
-            ```py
-            DataChain.from_values(fib=[1, 2, 3, 5, 8])
-            ```
-        """
-        tuple_type, output, tuples = values_to_tuples(ds_name, output, **fr_map)
-
-        def _func_fr() -> Iterator[tuple_type]:  # type: ignore[valid-type]
-            yield from tuples
+        *args,
+        **kwargs,
+    ) -> "DataChain":
+        from .values import from_values
 
-
-
-
-
-
+        warnings.warn(
+            "Class method `from_values` is deprecated. "
+            "Use `from_values` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
-
-        output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
-        return chain.gen(_func_fr, output=output)
+        return from_values(*args, **kwargs)
 
     @classmethod
-    def from_pandas(
+    def from_pandas(
         cls,
-
-
-        session: Optional[Session] = None,
-        settings: Optional[dict] = None,
-        in_memory: bool = False,
-        object_name: str = "",
+        *args,
+        **kwargs,
     ) -> "DataChain":
-
-
-        Example:
-            ```py
-            import pandas as pd
-
-            df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
-            DataChain.from_pandas(df)
-            ```
-        """
-        fr_map = {col.lower(): df[col].tolist() for col in df.columns}
-
-        for column in fr_map:
-            if not column.isidentifier():
-                raise DatasetPrepareError(
-                    name,
-                    f"import from pandas error - '{column}' cannot be a column name",
-                )
+        from .pandas import from_pandas
 
-
-
-
-
-
-            in_memory=in_memory,
-            **fr_map,
+        warnings.warn(
+            "Class method `from_pandas` is deprecated. "
+            "Use `from_pandas` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
+        return from_pandas(*args, **kwargs)
 
     def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
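The deleted docstring examples carry over directly to the new functions; a short sketch, again assuming the top-level re-exports:

```python
import pandas as pd
import datachain as dc

# Replaces DataChain.from_values(fib=[1, 2, 3, 5, 8]).
chain = dc.from_values(fib=[1, 2, 3, 5, 8])

# Replaces DataChain.from_pandas(df).
df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
chain = dc.from_pandas(df)
```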
@@ -1953,56 +1572,18 @@ class DataChain:
     @classmethod
     def from_hf(
         cls,
-        dataset: Union[str, "HFDatasetType"],
         *args,
-        session: Optional[Session] = None,
-        settings: Optional[dict] = None,
-        object_name: str = "",
-        model_name: str = "",
         **kwargs,
     ) -> "DataChain":
-
-
-        Parameters:
-            dataset : Path or name of the dataset to read from Hugging Face Hub,
-                or an instance of `datasets.Dataset`-like object.
-            session : Session to use for the chain.
-            settings : Settings to use for the chain.
-            object_name : Generated object column name.
-            model_name : Generated model name.
-            kwargs : Parameters to pass to datasets.load_dataset.
-
-        Example:
-            Load from Hugging Face Hub:
-            ```py
-            DataChain.from_hf("beans", split="train")
-            ```
-
-            Generate chain from loaded dataset:
-            ```py
-            from datasets import load_dataset
-            ds = load_dataset("beans", split="train")
-            DataChain.from_hf(ds)
-            ```
-        """
-        from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
-
-        output: dict[str, DataType] = {}
-        ds_dict = stream_splits(dataset, *args, **kwargs)
-        if len(ds_dict) > 1:
-            output = {"split": str}
+        from .hf import from_hf
 
-
-
-
-
-
-            output = {object_name: model}
-
-        chain = DataChain.from_values(
-            split=list(ds_dict.keys()), session=session, settings=settings
+        warnings.warn(
+            "Class method `from_hf` is deprecated. "
+            "Use `from_hf` function instead from top_module.",
+            DeprecationWarning,
+            stacklevel=2,
         )
-        return
+        return from_hf(*args, **kwargs)
 
     def parse_tabular(
         self,
|
|
|
2028
1609
|
Example:
|
|
2029
1610
|
Reading a json lines file:
|
|
2030
1611
|
```py
|
|
2031
|
-
|
|
2032
|
-
|
|
1612
|
+
import datachain as dc
|
|
1613
|
+
chain = dc.from_storage("s3://mybucket/file.jsonl")
|
|
1614
|
+
chain = chain.parse_tabular(format="json")
|
|
2033
1615
|
```
|
|
2034
1616
|
|
|
2035
1617
|
Reading a filtered list of files as a dataset:
|
|
2036
1618
|
```py
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
1619
|
+
import datachain as dc
|
|
1620
|
+
|
|
1621
|
+
chain = dc.from_storage("s3://mybucket")
|
|
1622
|
+
chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
|
|
1623
|
+
chain = chain.parse_tabular(format="json")
|
|
2040
1624
|
```
|
|
2041
1625
|
"""
|
|
2042
1626
|
from pyarrow.dataset import CsvFileFormat, JsonFileFormat
|
|
@@ -2093,161 +1677,34 @@ class DataChain:
|
|
|
2093
1677
|
@classmethod
|
|
2094
1678
|
def from_csv(
|
|
2095
1679
|
cls,
|
|
2096
|
-
|
|
2097
|
-
delimiter: Optional[str] = None,
|
|
2098
|
-
header: bool = True,
|
|
2099
|
-
output: OutputType = None,
|
|
2100
|
-
object_name: str = "",
|
|
2101
|
-
model_name: str = "",
|
|
2102
|
-
source: bool = True,
|
|
2103
|
-
nrows=None,
|
|
2104
|
-
session: Optional[Session] = None,
|
|
2105
|
-
settings: Optional[dict] = None,
|
|
2106
|
-
column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
|
|
2107
|
-
parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
|
|
1680
|
+
*args,
|
|
2108
1681
|
**kwargs,
|
|
2109
1682
|
) -> "DataChain":
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
Parameters:
|
|
2113
|
-
path : Storage URI with directory. URI must start with storage prefix such
|
|
2114
|
-
as `s3://`, `gs://`, `az://` or "file:///".
|
|
2115
|
-
delimiter : Character for delimiting columns. Takes precedence if also
|
|
2116
|
-
specified in `parse_options`. Defaults to ",".
|
|
2117
|
-
header : Whether the files include a header row.
|
|
2118
|
-
output : Dictionary or feature class defining column names and their
|
|
2119
|
-
corresponding types. List of column names is also accepted, in which
|
|
2120
|
-
case types will be inferred.
|
|
2121
|
-
object_name : Created object column name.
|
|
2122
|
-
model_name : Generated model name.
|
|
2123
|
-
source : Whether to include info about the source file.
|
|
2124
|
-
nrows : Optional row limit.
|
|
2125
|
-
session : Session to use for the chain.
|
|
2126
|
-
settings : Settings to use for the chain.
|
|
2127
|
-
column_types : Dictionary of column names and their corresponding types.
|
|
2128
|
-
It is passed to CSV reader and for each column specified type auto
|
|
2129
|
-
inference is disabled.
|
|
2130
|
-
parse_options: Tells the parser how to process lines.
|
|
2131
|
-
See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
|
|
2132
|
-
|
|
2133
|
-
Example:
|
|
2134
|
-
Reading a csv file:
|
|
2135
|
-
```py
|
|
2136
|
-
dc = DataChain.from_csv("s3://mybucket/file.csv")
|
|
2137
|
-
```
|
|
2138
|
-
|
|
2139
|
-
Reading csv files from a directory as a combined dataset:
|
|
2140
|
-
```py
|
|
2141
|
-
dc = DataChain.from_csv("s3://mybucket/dir")
|
|
2142
|
-
```
|
|
2143
|
-
"""
|
|
2144
|
-
from pandas.io.parsers.readers import STR_NA_VALUES
|
|
2145
|
-
from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
|
|
2146
|
-
from pyarrow.dataset import CsvFileFormat
|
|
2147
|
-
from pyarrow.lib import type_for_alias
|
|
2148
|
-
|
|
2149
|
-
parse_options = parse_options or {}
|
|
2150
|
-
if "delimiter" not in parse_options:
|
|
2151
|
-
parse_options["delimiter"] = ","
|
|
2152
|
-
if delimiter:
|
|
2153
|
-
parse_options["delimiter"] = delimiter
|
|
2154
|
-
|
|
2155
|
-
if column_types:
|
|
2156
|
-
column_types = {
|
|
2157
|
-
name: type_for_alias(typ) if isinstance(typ, str) else typ
|
|
2158
|
-
for name, typ in column_types.items()
|
|
2159
|
-
}
|
|
2160
|
-
else:
|
|
2161
|
-
column_types = {}
|
|
1683
|
+
from .csv import from_csv
|
|
2162
1684
|
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
if not header:
|
|
2169
|
-
if not output:
|
|
2170
|
-
msg = "error parsing csv - provide output if no header"
|
|
2171
|
-
raise DatasetPrepareError(chain.name, msg)
|
|
2172
|
-
if isinstance(output, Sequence):
|
|
2173
|
-
column_names = output # type: ignore[assignment]
|
|
2174
|
-
elif isinstance(output, dict):
|
|
2175
|
-
column_names = list(output.keys())
|
|
2176
|
-
elif (fr := ModelStore.to_pydantic(output)) is not None:
|
|
2177
|
-
column_names = list(fr.model_fields.keys())
|
|
2178
|
-
else:
|
|
2179
|
-
msg = f"error parsing csv - incompatible output type {type(output)}"
|
|
2180
|
-
raise DatasetPrepareError(chain.name, msg)
|
|
2181
|
-
|
|
2182
|
-
parse_options = ParseOptions(**parse_options)
|
|
2183
|
-
read_options = ReadOptions(column_names=column_names)
|
|
2184
|
-
convert_options = ConvertOptions(
|
|
2185
|
-
strings_can_be_null=True,
|
|
2186
|
-
null_values=STR_NA_VALUES,
|
|
2187
|
-
column_types=column_types,
|
|
2188
|
-
)
|
|
2189
|
-
format = CsvFileFormat(
|
|
2190
|
-
parse_options=parse_options,
|
|
2191
|
-
read_options=read_options,
|
|
2192
|
-
convert_options=convert_options,
|
|
2193
|
-
)
|
|
2194
|
-
return chain.parse_tabular(
|
|
2195
|
-
output=output,
|
|
2196
|
-
object_name=object_name,
|
|
2197
|
-
model_name=model_name,
|
|
2198
|
-
source=source,
|
|
2199
|
-
nrows=nrows,
|
|
2200
|
-
format=format,
|
|
1685
|
+
warnings.warn(
|
|
1686
|
+
"Class method `from_csv` is deprecated. "
|
|
1687
|
+
"Use `from_csv` function instead from top_module.",
|
|
1688
|
+
DeprecationWarning,
|
|
1689
|
+
stacklevel=2,
|
|
2201
1690
|
)
|
|
1691
|
+
return from_csv(*args, **kwargs)
|
|
2202
1692
|
|
|
2203
1693
|
@classmethod
|
|
2204
1694
|
def from_parquet(
|
|
2205
1695
|
cls,
|
|
2206
|
-
|
|
2207
|
-
partitioning: Any = "hive",
|
|
2208
|
-
output: Optional[dict[str, DataType]] = None,
|
|
2209
|
-
object_name: str = "",
|
|
2210
|
-
model_name: str = "",
|
|
2211
|
-
source: bool = True,
|
|
2212
|
-
session: Optional[Session] = None,
|
|
2213
|
-
settings: Optional[dict] = None,
|
|
1696
|
+
*args,
|
|
2214
1697
|
**kwargs,
|
|
2215
1698
|
) -> "DataChain":
|
|
2216
|
-
|
|
2217
|
-
|
|
2218
|
-
Parameters:
|
|
2219
|
-
path : Storage URI with directory. URI must start with storage prefix such
|
|
2220
|
-
as `s3://`, `gs://`, `az://` or "file:///".
|
|
2221
|
-
partitioning : Any pyarrow partitioning schema.
|
|
2222
|
-
output : Dictionary defining column names and their corresponding types.
|
|
2223
|
-
object_name : Created object column name.
|
|
2224
|
-
model_name : Generated model name.
|
|
2225
|
-
source : Whether to include info about the source file.
|
|
2226
|
-
session : Session to use for the chain.
|
|
2227
|
-
settings : Settings to use for the chain.
|
|
2228
|
-
|
|
2229
|
-
Example:
|
|
2230
|
-
Reading a single file:
|
|
2231
|
-
```py
|
|
2232
|
-
dc = DataChain.from_parquet("s3://mybucket/file.parquet")
|
|
2233
|
-
```
|
|
1699
|
+
from .parquet import from_parquet
|
|
2234
1700
|
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
|
|
2238
|
-
|
|
2239
|
-
|
|
2240
|
-
chain = DataChain.from_storage(
|
|
2241
|
-
path, session=session, settings=settings, **kwargs
|
|
2242
|
-
)
|
|
2243
|
-
return chain.parse_tabular(
|
|
2244
|
-
output=output,
|
|
2245
|
-
object_name=object_name,
|
|
2246
|
-
model_name=model_name,
|
|
2247
|
-
source=source,
|
|
2248
|
-
format="parquet",
|
|
2249
|
-
partitioning=partitioning,
|
|
1701
|
+
warnings.warn(
|
|
1702
|
+
"Class method `from_parquet` is deprecated. "
|
|
1703
|
+
"Use `from_parquet` function instead from top_module.",
|
|
1704
|
+
DeprecationWarning,
|
|
1705
|
+
stacklevel=2,
|
|
2250
1706
|
)
|
|
1707
|
+
return from_parquet(*args, **kwargs)
|
|
2251
1708
|
|
|
2252
1709
|
def to_parquet(
|
|
2253
1710
|
self,
|
|
@@ -2470,69 +1927,18 @@ class DataChain:
|
|
|
2470
1927
|
@classmethod
|
|
2471
1928
|
def from_records(
|
|
2472
1929
|
cls,
|
|
2473
|
-
|
|
2474
|
-
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
schema: Optional[dict[str, DataType]] = None,
|
|
2478
|
-
) -> "Self":
|
|
2479
|
-
"""Create a DataChain from the provided records. This method can be used for
|
|
2480
|
-
programmatically generating a chain in contrast of reading data from storages
|
|
2481
|
-
or other sources.
|
|
2482
|
-
|
|
2483
|
-
Parameters:
|
|
2484
|
-
to_insert : records (or a single record) to insert. Each record is
|
|
2485
|
-
a dictionary of signals and theirs values.
|
|
2486
|
-
schema : describes chain signals and their corresponding types
|
|
2487
|
-
|
|
2488
|
-
Example:
|
|
2489
|
-
```py
|
|
2490
|
-
single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
|
|
2491
|
-
```
|
|
2492
|
-
"""
|
|
2493
|
-
session = Session.get(session, in_memory=in_memory)
|
|
2494
|
-
catalog = session.catalog
|
|
2495
|
-
|
|
2496
|
-
name = session.generate_temp_dataset_name()
|
|
2497
|
-
signal_schema = None
|
|
2498
|
-
columns: list[sqlalchemy.Column] = []
|
|
2499
|
-
|
|
2500
|
-
if schema:
|
|
2501
|
-
signal_schema = SignalSchema(schema)
|
|
2502
|
-
columns = [
|
|
2503
|
-
sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
|
|
2504
|
-
for c in signal_schema.db_signals(as_columns=True) # type: ignore[assignment]
|
|
2505
|
-
]
|
|
2506
|
-
else:
|
|
2507
|
-
columns = [
|
|
2508
|
-
sqlalchemy.Column(name, typ)
|
|
2509
|
-
for name, typ in File._datachain_column_types.items()
|
|
2510
|
-
]
|
|
1930
|
+
*args,
|
|
1931
|
+
**kwargs,
|
|
1932
|
+
) -> "DataChain":
|
|
1933
|
+
from .records import from_records
|
|
2511
1934
|
|
|
2512
|
-
|
|
2513
|
-
|
|
2514
|
-
|
|
2515
|
-
|
|
2516
|
-
|
|
2517
|
-
if signal_schema
|
|
2518
|
-
else None
|
|
2519
|
-
),
|
|
1935
|
+
warnings.warn(
|
|
1936
|
+
"Class method `from_records` is deprecated. "
|
|
1937
|
+
"Use `from_records` function instead from top_module.",
|
|
1938
|
+
DeprecationWarning,
|
|
1939
|
+
stacklevel=2,
|
|
2520
1940
|
)
|
|
2521
|
-
|
|
2522
|
-
session.add_dataset_version(dsr, dsr.latest_version)
|
|
2523
|
-
|
|
2524
|
-
if isinstance(to_insert, dict):
|
|
2525
|
-
to_insert = [to_insert]
|
|
2526
|
-
elif not to_insert:
|
|
2527
|
-
to_insert = []
|
|
2528
|
-
|
|
2529
|
-
warehouse = catalog.warehouse
|
|
2530
|
-
dr = warehouse.dataset_rows(dsr)
|
|
2531
|
-
db = warehouse.db
|
|
2532
|
-
insert_q = dr.get_table().insert()
|
|
2533
|
-
for record in to_insert:
|
|
2534
|
-
db.execute(insert_q.values(**record))
|
|
2535
|
-
return cls.from_dataset(name=dsr.name, session=session, settings=settings)
|
|
1941
|
+
return from_records(*args, **kwargs)
|
|
2536
1942
|
|
|
2537
1943
|
def sum(self, fr: DataType): # type: ignore[override]
|
|
2538
1944
|
"""Compute the sum of a column."""
|
|
@@ -2560,9 +1966,10 @@ class DataChain:
|
|
|
2560
1966
|
```py
|
|
2561
1967
|
import anthropic
|
|
2562
1968
|
from anthropic.types import Message
|
|
1969
|
+
import datachain as dc
|
|
2563
1970
|
|
|
2564
1971
|
(
|
|
2565
|
-
|
|
1972
|
+
dc.from_storage(DATA, type="text")
|
|
2566
1973
|
.settings(parallel=4, cache=True)
|
|
2567
1974
|
.setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
|
|
2568
1975
|
.map(
|
|
@@ -2612,7 +2019,9 @@ class DataChain:
|
|
|
2612
2019
|
Example:
|
|
2613
2020
|
Cross cloud transfer
|
|
2614
2021
|
```py
|
|
2615
|
-
|
|
2022
|
+
import datachain as dc
|
|
2023
|
+
|
|
2024
|
+
ds = dc.from_storage("s3://mybucket")
|
|
2616
2025
|
ds.to_storage("gs://mybucket", placement="filename")
|
|
2617
2026
|
```
|
|
2618
2027
|
"""
|
|
@@ -2728,7 +2137,9 @@ class DataChain:
|
|
|
2728
2137
|
|
|
2729
2138
|
Example:
|
|
2730
2139
|
```py
|
|
2731
|
-
|
|
2140
|
+
import datachain as dc
|
|
2141
|
+
|
|
2142
|
+
chain = dc.from_storage(...)
|
|
2732
2143
|
chunk_1 = query._chunk(0, 2)
|
|
2733
2144
|
chunk_2 = query._chunk(1, 2)
|
|
2734
2145
|
```
|
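For downstream code that cannot migrate in one pass, the standard `warnings` machinery controls these deprecations; a sketch (the regex matches the message text used by the shims in this diff):

```python
import warnings

# Turn the deprecations into errors in CI so stale call sites fail fast.
warnings.filterwarnings(
    "error",
    category=DeprecationWarning,
    message=r"Class method `\w+` is deprecated\.",
)

# Or silence them temporarily around legacy call sites while migrating.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    chain = build_legacy_chain()  # hypothetical helper still using the classmethods
```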