datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/listings.py
CHANGED
@@ -1,25 +1,61 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

+from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
+from datachain.lib.settings import Settings
+from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.dataset import DatasetQuery, QueryStep, step_result

 from .values import read_values

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec

+    from datachain.dataset import DatasetVersion
+    from datachain.query.dataset import StepResult
+
     from .datachain import DataChain

 P = ParamSpec("P")


+class ReadOnlyQueryStep(QueryStep):
+    """
+    This step is used to read the dataset in read-only mode.
+    It is used to avoid the need to read the table metadata from the warehouse.
+    This is useful when we want to list the files in the dataset.
+    """
+
+    def apply(self) -> "StepResult":
+        import sqlalchemy as sa
+
+        def q(*columns):
+            return sa.select(*columns)
+
+        table_name = self.catalog.warehouse.dataset_table_name(
+            self.dataset, self.dataset_version
+        )
+        dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
+        table = dataset_row_cls.new_table(
+            table_name,
+            columns=(
+                [
+                    *dataset_row_cls.sys_columns(),
+                    *dataset_row_cls.listing_columns(),
+                ]
+            ),
+        )
+
+        return step_result(
+            q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+        )
+
+
 def listings(
-    session: Optional[Session] = None,
+    session: Session | None = None,
     in_memory: bool = False,
-    object_name: str = "listing",
+    column: str = "listing",
     **kwargs,
 ) -> "DataChain":
     """Generate chain with list of cached listings.
@@ -38,6 +74,74 @@ def listings(
     return read_values(
         session=session,
         in_memory=in_memory,
-        output={object_name: ListingInfo},
-        **{object_name: catalog.listings()},  # type: ignore[arg-type]
+        output={column: ListingInfo},
+        **{column: catalog.listings()},  # type: ignore[arg-type]
     )
+
+
+def read_listing_dataset(
+    name: str,
+    version: str | None = None,
+    path: str = "",
+    session: Session | None = None,
+    settings: dict | None = None,
+) -> tuple["DataChain", "DatasetVersion"]:
+    """Read a listing dataset and return a DataChain and listing version.
+
+    Args:
+        name: Name of the dataset
+        version: Version of the dataset
+        path: Path within the listing to read. Path can have globs.
+        session: Optional Session object to use for reading
+        settings: Optional settings dictionary to use for reading
+
+    Returns:
+        tuple[DataChain, DatasetVersion]: A tuple containing:
+            - DataChain configured for listing files
+            - DatasetVersion object for the specified listing version
+
+    Example:
+        ```py
+        import datachain as dc
+        chain, listing_version = dc.read_listing_dataset(
+            "lst__s3://my-bucket/my-path", version="1.0.0", path="my-path"
+        )
+        chain.show()
+        ```
+    """
+    # Configure and return a DataChain for reading listing dataset files
+    # Uses ReadOnlyQueryStep to avoid warehouse metadata lookups
+    from datachain.lib.dc import Sys
+    from datachain.lib.file import File
+
+    from .datachain import DataChain
+
+    if not name.startswith(LISTING_PREFIX):
+        name = LISTING_PREFIX + name
+
+    session = Session.get(session)
+    dataset = session.catalog.get_dataset(name)
+    if version is None:
+        version = dataset.latest_version
+
+    query = DatasetQuery(name=name, session=session)
+
+    if settings:
+        cfg = {**settings}
+        if "prefetch" not in cfg:
+            cfg["prefetch"] = 0
+        _settings = Settings(**cfg)
+    else:
+        _settings = Settings(prefetch=0)
+    signal_schema = SignalSchema({"sys": Sys, "file": File})
+
+    query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+    query.version = version
+    # We already know that this is a listing dataset,
+    # so we can set the listing function to True
+    query.set_listing_fn(lambda: True)
+
+    chain = DataChain(query, _settings, signal_schema)
+    chain = ls(chain, path, recursive=True, column="file")
+
+    return chain, dataset.get_version(version)
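
The new `read_listing_dataset` helper is the notable addition in this file. A minimal usage sketch based on the docstring in the diff above; the bucket, version, and glob are hypothetical:

```py
import datachain as dc

# The LISTING_PREFIX ("lst__") is prepended automatically when the name
# lacks it, so a plain storage URI also resolves to the listing dataset.
chain, listing_version = dc.read_listing_dataset(
    "s3://my-bucket/my-path",  # resolved to "lst__s3://my-bucket/my-path"
    version="1.0.0",           # defaults to the latest version when omitted
    path="my-path/*.jpg",      # paths within the listing may contain globs
)
chain.show()
```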
datachain/lib/dc/pandas.py
CHANGED
@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

 from datachain.query import Session

@@ -19,10 +16,10 @@ if TYPE_CHECKING:
 def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    object_name: str = "",
+    column: str = "",
 ) -> "DataChain":
     """Generate chain from pandas data-frame.

@@ -37,20 +34,27 @@ def read_pandas(  # type: ignore[override]
     """
     from .utils import DatasetPrepareError

-    fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()

-    for column in fr_map:
-        if not column.isidentifier():
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
+
+    for c in fr_map:
+        if not c.isidentifier():
             raise DatasetPrepareError(
                 name,
-                f"import from pandas error - '{column}' cannot be a column name",
+                f"import from pandas error - '{c}' cannot be a column name",
             )

     return read_values(
         name,
         session,
         settings=settings,
-        object_name=object_name,
+        column=column,
         in_memory=in_memory,
         **fr_map,
     )
datachain/lib/dc/parquet.py
CHANGED
@@ -1,8 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Optional,
-)
+import os
+from typing import TYPE_CHECKING, Any

 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -16,28 +13,34 @@ if TYPE_CHECKING:


 def read_parquet(
-    path,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     partitioning: Any = "hive",
-    output: Optional[dict[str, DataType]] = None,
-    object_name: str = "",
+    output: dict[str, DataType] | None = None,
+    column: str = "",
     model_name: str = "",
     source: bool = True,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from parquet files.

     Parameters:
-        path : Storage URI with directory containing parquet files. URI must
-            start with a storage prefix such as `s3://`, `gs://`, `az://` or "file:///".
-        partitioning : Any pyarrow partitioning schema.
-        output : Dictionary defining column names and their corresponding types.
-        object_name : Created object column name.
-        model_name : Generated model name.
-        source : Whether to include info about the source file.
-        session : Session to use for the chain.
-        settings : Settings to use for the chain.
+        path: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+            - `*` : wildcard
+            - `**` : recursive wildcard
+            - `?` : single character
+            - `{a,b}` : brace expansion list
+            - `{1..9}` : brace numeric or alphabetic range
+        partitioning: Any pyarrow partitioning schema.
+        output: Dictionary defining column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.

     Example:
         Reading a single file:
@@ -46,10 +49,19 @@ def read_parquet(
         dc.read_parquet("s3://mybucket/file.parquet")
         ```

-        Reading a partitioned dataset from a directory:
+        All files from a directory:
         ```py
-        dc.read_parquet("s3://mybucket/dir")
-
+        dc.read_parquet("s3://mybucket/dir/")
+        ```
+
+        Only parquet files from a directory, and all it's subdirectories:
+        ```py
+        dc.read_parquet("s3://mybucket/dir/**/*.parquet")
+        ```
+
+        Using filename patterns - numeric, list, starting with zeros:
+        ```py
+        dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
         ```
     """
     from .storage import read_storage
@@ -57,7 +69,7 @@ def read_parquet(
     chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         format="parquet",
datachain/lib/dc/records.py
CHANGED
@@ -1,15 +1,10 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING

 import sqlalchemy

 from datachain.lib.data_model import DataType
-from datachain.lib.file import (
-    File,
-)
+from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session

@@ -20,29 +15,37 @@ if TYPE_CHECKING:

 P = ParamSpec("P")

+READ_RECORDS_BATCH_SIZE = 10000
+

 def read_records(
-    to_insert: Optional[Union[dict, list[dict]]],
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    to_insert: dict | Iterable[dict] | None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    schema: Optional[dict[str, DataType]] = None,
+    schema: dict[str, DataType] | None = None,
 ) -> "DataChain":
     """Create a DataChain from the provided records. This method can be used for
     programmatically generating a chain in contrast of reading data from storages
     or other sources.

     Parameters:
-        to_insert : records (or a single record) to insert. Each record is
-            a dictionary of signals and their values.
-        schema : describes chain signals and their corresponding types
+        to_insert: records (or a single record) to insert. Each record is
+            a dictionary of signals and their values.
+        schema: describes chain signals and their corresponding types

     Example:
         ```py
         import datachain as dc
         single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
+
+    Notes:
+        This call blocks until all records are inserted.
     """
+    from datachain.query.dataset import adjust_outputs, get_col_types
+    from datachain.sql.types import SQLType
+
     from .datasets import read_dataset

     session = Session.get(session, in_memory=in_memory)
@@ -56,7 +59,7 @@ def read_records(
         signal_schema = SignalSchema(schema)
         columns = [
             sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
-            for c in signal_schema.db_signals(as_columns=True)
+            for c in signal_schema.db_signals(as_columns=True)
         ]
     else:
         columns = [
@@ -66,6 +69,7 @@ def read_records(

     dsr = catalog.create_dataset(
         name,
+        catalog.metastore.default_project,
         columns=columns,
         feature_schema=(
             signal_schema.clone_without_sys_signals().serialize()
@@ -74,8 +78,6 @@ def read_records(
         ),
     )

-    session.add_dataset_version(dsr, dsr.latest_version)
-
     if isinstance(to_insert, dict):
         to_insert = [to_insert]
     elif not to_insert:
@@ -83,8 +85,14 @@ def read_records(

     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-
-    insert_q = dr.get_table().insert()
-    for record in to_insert:
-        warehouse.db.execute(insert_q.values(**record))
-    return read_dataset(name=name, session=session, settings=settings)
+    table = dr.get_table()
+
+    # Optimization: Compute row types once, rather than for every row.
+    col_types = get_col_types(
+        warehouse,
+        {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
+    )
+    records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
+    warehouse.insert_rows_done(table)
+    return read_dataset(name=dsr.full_name, session=session, settings=settings)