datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datasets.py
CHANGED
````diff
@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, get_origin, get_type_hints
 
 from datachain.error import (
     DatasetNotFoundError,
@@ -26,20 +26,21 @@ if TYPE_CHECKING:
 
 def read_dataset(
     name: str,
-    namespace:
-    project:
-    version:
-    session:
-    settings:
-    delta:
-    delta_on:
+    namespace: str | None = None,
+    project: str | None = None,
+    version: str | int | None = None,
+    session: Session | None = None,
+    settings: dict | None = None,
+    delta: bool | None = False,
+    delta_on: str | Sequence[str] | None = (
         "file.path",
         "file.etag",
         "file.version",
     ),
-    delta_result_on:
-    delta_compare:
-    delta_retry:
+    delta_result_on: str | Sequence[str] | None = None,
+    delta_compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+    delta_unsafe: bool = False,
     update: bool = False,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
@@ -50,14 +51,14 @@ def read_dataset(
             namespace and project. Alternatively, it can be a regular name, in which
             case the explicitly defined namespace and project will be used if they are
             set; otherwise, default values will be applied.
-        namespace
-        project
-        version
+        namespace: optional name of namespace in which dataset to read is created
+        project: optional name of project in which dataset to read is created
+        version: dataset version. Supports:
            - Exact version strings: "1.2.3"
            - Legacy integer versions: 1, 2, 3 (finds latest major version)
            - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
-        session
-        settings
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
         delta: If True, only process new or changed files instead of reprocessing
             everything. This saves time by skipping files that were already processed in
             previous versions. The optimization is working when a new version of the
@@ -80,6 +81,8 @@ def read_dataset(
         update: If True always checks for newer versions available on Studio, even if
             some version of the dataset exists locally already. If False (default), it
             will only fetch the dataset from Studio if it is not found locally.
+        delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+            distinct.
 
 
     Example:
@@ -197,6 +200,10 @@ def read_dataset(
         signals_schema |= SignalSchema.deserialize(query.feature_schema)
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
+
+    if delta:
+        signals_schema = signals_schema.clone_without_sys_signals()
+
     chain = DataChain(query, _settings, signals_schema)
 
     if delta:
@@ -205,19 +212,20 @@ def read_dataset(
             right_on=delta_result_on,
             compare=delta_compare,
             delta_retry=delta_retry,
+            delta_unsafe=delta_unsafe,
         )
 
     return chain
 
 
 def datasets(
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    column:
+    column: str | None = None,
     include_listing: bool = False,
     studio: bool = False,
-    attrs:
+    attrs: list[str] | None = None,
 ) -> "DataChain":
     """Generate chain with list of registered datasets.
 
@@ -294,12 +302,12 @@ def datasets(
 
 def delete_dataset(
     name: str,
-    namespace:
-    project:
-    version:
-    force:
-    studio:
-    session:
+    namespace: str | None = None,
+    project: str | None = None,
+    version: str | None = None,
+    force: bool | None = False,
+    studio: bool | None = False,
+    session: Session | None = None,
     in_memory: bool = False,
 ) -> None:
     """Removes specific dataset version or all dataset versions, depending on
@@ -310,9 +318,9 @@ def delete_dataset(
             namespace and project. Alternatively, it can be a regular name, in which
             case the explicitly defined namespace and project will be used if they are
             set; otherwise, default values will be applied.
-        namespace
-        project
-        version
+        namespace: optional name of namespace in which dataset to delete is created
+        project: optional name of project in which dataset to delete is created
+        version: Optional dataset version
         force: If true, all datasets versions will be removed. Defaults to False.
         studio: If True, removes dataset from Studio only, otherwise removes local
             dataset. Defaults to False.
@@ -373,7 +381,7 @@ def delete_dataset(
 def move_dataset(
     src: str,
     dest: str,
-    session:
+    session: Session | None = None,
     in_memory: bool = False,
 ) -> None:
     """Moves an entire dataset between namespaces and projects.
````
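The widened `read_dataset` signature (PEP 440 version specifiers plus the new `delta_unsafe` flag) would be exercised roughly as in the sketch below. The dataset name is hypothetical and the call is inferred from the docstring above, not code shipped in this release.

```py
import datachain as dc

# Pull the latest 1.x version of a saved dataset and process it in delta mode,
# explicitly opting in to the operations that delta otherwise restricts
# (merge, agg, union, group_by, distinct).
chain = dc.read_dataset(
    "dev.analytics.clips",         # hypothetical namespace.project.name
    version=">=1.0.0,<2.0.0",      # PEP 440 specifier, per the updated docstring
    delta=True,
    delta_on=("file.path", "file.etag", "file.version"),
    delta_unsafe=True,             # new parameter in 0.39.0
)
```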
datachain/lib/dc/hf.py
CHANGED
````diff
@@ -1,8 +1,4 @@
-from typing import
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from typing import TYPE_CHECKING, Any
 
 from datachain.lib.data_model import dict_to_data_model
 from datachain.query import Session
@@ -19,29 +15,29 @@ if TYPE_CHECKING:
 
 
 def read_hf(
-    dataset:
-    *args,
-    session:
-    settings:
+    dataset: "HFDatasetType",
+    *args: Any,
+    session: Session | None = None,
+    settings: dict | None = None,
     column: str = "",
     model_name: str = "",
     limit: int = 0,
-    **kwargs,
+    **kwargs: Any,
 ) -> "DataChain":
     """Generate chain from Hugging Face Hub dataset.
 
     Parameters:
-        dataset
+        dataset: Path or name of the dataset to read from Hugging Face Hub,
            or an instance of `datasets.Dataset`-like object.
-        args
-        session
-        settings
-        column
-        model_name
-        limit
-
-
-        kwargs
+        args: Additional positional arguments to pass to `datasets.load_dataset`.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        column: Generated object column name.
+        model_name: Generated model name.
+        limit: The maximum number of items to read from the HF dataset.
+            Applies `take(limit)` to `datasets.load_dataset`.
+            Defaults to 0 (no limit).
+        kwargs: Parameters to pass to `datasets.load_dataset`.
 
     Example:
         Load from Hugging Face Hub:
````
datachain/lib/dc/json.py
CHANGED
````diff
@@ -1,7 +1,6 @@
 import os
-import os.path
 import re
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 import cloudpickle
 
@@ -18,30 +17,30 @@ if TYPE_CHECKING:
 
 
 def read_json(
-    path:
+    path: str | os.PathLike[str],
     type: FileType = "text",
-    spec:
-    schema_from:
-    jmespath:
-    column:
-    model_name:
-    format:
-    nrows=None,
+    spec: DataType | None = None,
+    schema_from: str | None = "auto",
+    jmespath: str | None = None,
+    column: str | None = "",
+    model_name: str | None = None,
+    format: str | None = "json",
+    nrows: int | None = None,
     **kwargs,
 ) -> "DataChain":
     """Get data from JSON. It returns the chain itself.
 
     Parameters:
-        path
+        path: storage URI with directory. URI must start with storage prefix such
            as `s3://`, `gs://`, `az://` or "file:///"
-        type
-        spec
-        schema_from
-        column
-        model_name
+        type: read file as "binary", "text", or "image" data. Default is "text".
+        spec: optional Data Model
+        schema_from: path to sample to infer spec (if schema not provided)
+        column: generated column name
+        model_name: optional generated model name
         format: "json", "jsonl"
-        jmespath
-        nrows
+        jmespath: optional JMESPATH expression to reduce JSON
+        nrows: optional row limit for jsonl and JSON arrays
 
     Example:
         infer JSON schema from data, reduce using JMESPATH
````
datachain/lib/dc/listings.py
CHANGED
````diff
@@ -1,7 +1,4 @@
-from typing import
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
@@ -56,7 +53,7 @@ class ReadOnlyQueryStep(QueryStep):
 
 
 def listings(
-    session:
+    session: Session | None = None,
     in_memory: bool = False,
     column: str = "listing",
     **kwargs,
@@ -84,10 +81,10 @@ def listings(
 
 def read_listing_dataset(
     name: str,
-    version:
+    version: str | None = None,
     path: str = "",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
 ) -> tuple["DataChain", "DatasetVersion"]:
     """Read a listing dataset and return a DataChain and listing version.
 
````
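A sketch of `read_listing_dataset` derived from its new signature; the listing dataset name and version are hypothetical, and the import path simply mirrors where the function is defined in this diff.

```py
from datachain.lib.dc.listings import read_listing_dataset

# Open an existing listing dataset at a specific version and narrow it to a
# sub-path; returns the chain together with the resolved listing version.
chain, listing_version = read_listing_dataset(
    "lst__s3://mybucket/",  # hypothetical listing dataset name
    version="1.0.0",
    path="images/",
)
```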
datachain/lib/dc/pandas.py
CHANGED
````diff
@@ -1,7 +1,4 @@
-from typing import
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING
 
 from datachain.query import Session
 
@@ -19,8 +16,8 @@ if TYPE_CHECKING:
 def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
     column: str = "",
 ) -> "DataChain":
````
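The `read_pandas` signature is unchanged apart from the `X | None` annotations, so usage stays as before; a minimal sketch with an arbitrary DataFrame:

```py
import pandas as pd

import datachain as dc

df = pd.DataFrame({"name": ["a.txt", "b.txt"], "size": [12, 34]})

# Wrap an in-memory DataFrame in a chain; each DataFrame column becomes a signal.
chain = dc.read_pandas(df, in_memory=True)
```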
datachain/lib/dc/parquet.py
CHANGED
````diff
@@ -1,8 +1,5 @@
-
-
-    Any,
-    Optional,
-)
+import os
+from typing import TYPE_CHECKING, Any
 
 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -16,28 +13,34 @@ if TYPE_CHECKING:
 
 
 def read_parquet(
-    path,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     partitioning: Any = "hive",
-    output:
+    output: dict[str, DataType] | None = None,
     column: str = "",
     model_name: str = "",
     source: bool = True,
-    session:
-    settings:
+    session: Session | None = None,
+    settings: dict | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from parquet files.
 
     Parameters:
-        path
-
-
-
-
-
-
-
-
+        path: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion list
+            - `{1..9}` : brace numeric or alphabetic range
+        partitioning: Any pyarrow partitioning schema.
+        output: Dictionary defining column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
 
     Example:
         Reading a single file:
@@ -46,10 +49,19 @@ def read_parquet(
         dc.read_parquet("s3://mybucket/file.parquet")
         ```
 
-
+        All files from a directory:
         ```py
-
-
+        dc.read_parquet("s3://mybucket/dir/")
+        ```
+
+        Only parquet files from a directory, and all it's subdirectories:
+        ```py
+        dc.read_parquet("s3://mybucket/dir/**/*.parquet")
+        ```
+
+        Using filename patterns - numeric, list, starting with zeros:
+        ```py
+        dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
         ```
     """
     from .storage import read_storage
````
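The docstring now spells out glob and brace-expansion support for `path`; the calls below restate those documented examples (bucket names are placeholders).

```py
import datachain as dc

# A single file.
chain = dc.read_parquet("s3://mybucket/file.parquet")

# Every parquet file under a prefix, recursively.
chain = dc.read_parquet("s3://mybucket/dir/**/*.parquet")

# Brace expansion over a numeric range and a list of names.
chain = dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
```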
datachain/lib/dc/records.py
CHANGED
````diff
@@ -1,5 +1,5 @@
 from collections.abc import Iterable
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 import sqlalchemy
 
@@ -19,20 +19,20 @@ READ_RECORDS_BATCH_SIZE = 10000
 
 
 def read_records(
-    to_insert:
-    session:
-    settings:
+    to_insert: dict | Iterable[dict] | None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    schema:
+    schema: dict[str, DataType] | None = None,
 ) -> "DataChain":
     """Create a DataChain from the provided records. This method can be used for
     programmatically generating a chain in contrast of reading data from storages
     or other sources.
 
     Parameters:
-        to_insert
-            a dictionary of signals and
-        schema
+        to_insert: records (or a single record) to insert. Each record is
+            a dictionary of signals and their values.
+        schema: describes chain signals and their corresponding types
 
     Example:
         ```py
@@ -45,7 +45,6 @@ def read_records(
     """
     from datachain.query.dataset import adjust_outputs, get_col_types
     from datachain.sql.types import SQLType
-    from datachain.utils import batched
 
     from .datasets import read_dataset
 
@@ -79,8 +78,6 @@ def read_records(
         ),
     )
 
-    session.add_dataset_version(dsr, dsr.latest_version)
-
     if isinstance(to_insert, dict):
         to_insert = [to_insert]
     elif not to_insert:
@@ -96,7 +93,6 @@ def read_records(
         {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
    )
     records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
-
-    warehouse.insert_rows(table, chunk)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
     warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.full_name, session=session, settings=settings)
````