datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/datasets.py CHANGED
@@ -1,5 +1,5 @@
  from collections.abc import Sequence
- from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
+ from typing import TYPE_CHECKING, get_origin, get_type_hints

  from datachain.error import (
  DatasetNotFoundError,
@@ -26,20 +26,21 @@ if TYPE_CHECKING:

  def read_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
- version: Optional[Union[str, int]] = None,
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
- delta: Optional[bool] = False,
- delta_on: Optional[Union[str, Sequence[str]]] = (
+ namespace: str | None = None,
+ project: str | None = None,
+ version: str | int | None = None,
+ session: Session | None = None,
+ settings: dict | None = None,
+ delta: bool | None = False,
+ delta_on: str | Sequence[str] | None = (
  "file.path",
  "file.etag",
  "file.version",
  ),
- delta_result_on: Optional[Union[str, Sequence[str]]] = None,
- delta_compare: Optional[Union[str, Sequence[str]]] = None,
- delta_retry: Optional[Union[bool, str]] = None,
+ delta_result_on: str | Sequence[str] | None = None,
+ delta_compare: str | Sequence[str] | None = None,
+ delta_retry: bool | str | None = None,
+ delta_unsafe: bool = False,
  update: bool = False,
  ) -> "DataChain":
  """Get data from a saved Dataset. It returns the chain itself.
@@ -50,14 +51,14 @@ def read_dataset(
  namespace and project. Alternatively, it can be a regular name, in which
  case the explicitly defined namespace and project will be used if they are
  set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to read is created
- project : optional name of project in which dataset to read is created
- version : dataset version. Supports:
+ namespace: optional name of namespace in which dataset to read is created
+ project: optional name of project in which dataset to read is created
+ version: dataset version. Supports:
  - Exact version strings: "1.2.3"
  - Legacy integer versions: 1, 2, 3 (finds latest major version)
  - Version specifiers (PEP 440): ">=1.0.0,<2.0.0", "~=1.4.2", "==1.2.*", etc.
- session : Session to use for the chain.
+ session: Session to use for the chain.
- settings : Settings to use for the chain.
+ settings: Settings to use for the chain.
  delta: If True, only process new or changed files instead of reprocessing
  everything. This saves time by skipping files that were already processed in
  previous versions. The optimization is working when a new version of the
@@ -80,6 +81,8 @@ def read_dataset(
  update: If True always checks for newer versions available on Studio, even if
  some version of the dataset exists locally already. If False (default), it
  will only fetch the dataset from Studio if it is not found locally.
+ delta_unsafe: Allow restricted ops in delta: merge, agg, union, group_by,
+ distinct.


  Example:
@@ -197,6 +200,10 @@ def read_dataset(
  signals_schema |= SignalSchema.deserialize(query.feature_schema)
  else:
  signals_schema |= SignalSchema.from_column_types(query.column_types or {})
+
+ if delta:
+ signals_schema = signals_schema.clone_without_sys_signals()
+
  chain = DataChain(query, _settings, signals_schema)

  if delta:
@@ -205,19 +212,20 @@ def read_dataset(
  right_on=delta_result_on,
  compare=delta_compare,
  delta_retry=delta_retry,
+ delta_unsafe=delta_unsafe,
  )

  return chain


  def datasets(
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
+ session: Session | None = None,
+ settings: dict | None = None,
  in_memory: bool = False,
- column: Optional[str] = None,
+ column: str | None = None,
  include_listing: bool = False,
  studio: bool = False,
- attrs: Optional[list[str]] = None,
+ attrs: list[str] | None = None,
  ) -> "DataChain":
  """Generate chain with list of registered datasets.

@@ -294,12 +302,12 @@ def datasets(

  def delete_dataset(
  name: str,
- namespace: Optional[str] = None,
- project: Optional[str] = None,
- version: Optional[str] = None,
- force: Optional[bool] = False,
- studio: Optional[bool] = False,
- session: Optional[Session] = None,
+ namespace: str | None = None,
+ project: str | None = None,
+ version: str | None = None,
+ force: bool | None = False,
+ studio: bool | None = False,
+ session: Session | None = None,
  in_memory: bool = False,
  ) -> None:
  """Removes specific dataset version or all dataset versions, depending on
@@ -310,9 +318,9 @@ def delete_dataset(
  namespace and project. Alternatively, it can be a regular name, in which
  case the explicitly defined namespace and project will be used if they are
  set; otherwise, default values will be applied.
- namespace : optional name of namespace in which dataset to delete is created
- project : optional name of project in which dataset to delete is created
- version : Optional dataset version
+ namespace: optional name of namespace in which dataset to delete is created
+ project: optional name of project in which dataset to delete is created
+ version: Optional dataset version
  force: If true, all datasets versions will be removed. Defaults to False.
  studio: If True, removes dataset from Studio only, otherwise removes local
  dataset. Defaults to False.
@@ -373,7 +381,7 @@ def delete_dataset(
  def move_dataset(
  src: str,
  dest: str,
- session: Optional[Session] = None,
+ session: Session | None = None,
  in_memory: bool = False,
  ) -> None:
  """Moves an entire dataset between namespaces and projects.
datachain/lib/dc/hf.py CHANGED
@@ -1,8 +1,4 @@
- from typing import (
- TYPE_CHECKING,
- Optional,
- Union,
- )
+ from typing import TYPE_CHECKING, Any

  from datachain.lib.data_model import dict_to_data_model
  from datachain.query import Session
@@ -19,29 +15,29 @@ if TYPE_CHECKING:


  def read_hf(
- dataset: Union[str, "HFDatasetType"],
- *args,
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
+ dataset: "HFDatasetType",
+ *args: Any,
+ session: Session | None = None,
+ settings: dict | None = None,
  column: str = "",
  model_name: str = "",
  limit: int = 0,
- **kwargs,
+ **kwargs: Any,
  ) -> "DataChain":
  """Generate chain from Hugging Face Hub dataset.

  Parameters:
- dataset : Path or name of the dataset to read from Hugging Face Hub,
+ dataset: Path or name of the dataset to read from Hugging Face Hub,
  or an instance of `datasets.Dataset`-like object.
- args : Additional positional arguments to pass to `datasets.load_dataset`.
- session : Session to use for the chain.
- settings : Settings to use for the chain.
- column : Generated object column name.
- model_name : Generated model name.
- limit : Limit the number of items to read from the HF dataset.
- Adds `take(limit)` to the `datasets.load_dataset`.
- Defaults to 0 (no limit).
- kwargs : Parameters to pass to `datasets.load_dataset`.
+ args: Additional positional arguments to pass to `datasets.load_dataset`.
+ session: Session to use for the chain.
+ settings: Settings to use for the chain.
+ column: Generated object column name.
+ model_name: Generated model name.
+ limit: The maximum number of items to read from the HF dataset.
+ Applies `take(limit)` to `datasets.load_dataset`.
+ Defaults to 0 (no limit).
+ kwargs: Parameters to pass to `datasets.load_dataset`.

  Example:
  Load from Hugging Face Hub:
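
With the retyped signature above, a short `read_hf` sketch; the dataset name and the `split` keyword forwarded to `datasets.load_dataset` are illustrative:

```py
import datachain as dc

# Extra args/kwargs are passed straight to datasets.load_dataset;
# limit=100 applies take(100) to the loaded HF dataset.
chain = dc.read_hf("beans", split="train", column="hf", limit=100)
chain.show(3)
```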
datachain/lib/dc/json.py CHANGED
@@ -1,7 +1,6 @@
  import os
- import os.path
  import re
- from typing import TYPE_CHECKING, Optional, Union
+ from typing import TYPE_CHECKING

  import cloudpickle

@@ -18,30 +17,30 @@ if TYPE_CHECKING:


  def read_json(
- path: Union[str, os.PathLike[str]],
+ path: str | os.PathLike[str],
  type: FileType = "text",
- spec: Optional[DataType] = None,
- schema_from: Optional[str] = "auto",
- jmespath: Optional[str] = None,
- column: Optional[str] = "",
- model_name: Optional[str] = None,
- format: Optional[str] = "json",
- nrows=None,
+ spec: DataType | None = None,
+ schema_from: str | None = "auto",
+ jmespath: str | None = None,
+ column: str | None = "",
+ model_name: str | None = None,
+ format: str | None = "json",
+ nrows: int | None = None,
  **kwargs,
  ) -> "DataChain":
  """Get data from JSON. It returns the chain itself.

  Parameters:
- path : storage URI with directory. URI must start with storage prefix such
+ path: storage URI with directory. URI must start with storage prefix such
  as `s3://`, `gs://`, `az://` or "file:///"
- type : read file as "binary", "text", or "image" data. Default is "text".
- spec : optional Data Model
- schema_from : path to sample to infer spec (if schema not provided)
- column : generated column name
- model_name : optional generated model name
+ type: read file as "binary", "text", or "image" data. Default is "text".
+ spec: optional Data Model
+ schema_from: path to sample to infer spec (if schema not provided)
+ column: generated column name
+ model_name: optional generated model name
  format: "json", "jsonl"
- jmespath : optional JMESPATH expression to reduce JSON
- nrows : optional row limit for jsonl and JSON arrays
+ jmespath: optional JMESPATH expression to reduce JSON
+ nrows: optional row limit for jsonl and JSON arrays

  Example:
  infer JSON schema from data, reduce using JMESPATH
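
For reference, a small `read_json` sketch using the retyped `nrows` parameter; the JSONL path is hypothetical:

```py
import datachain as dc

# Read at most 1000 records from a JSONL file and infer the schema
# from the data (schema_from="auto" is the default).
chain = dc.read_json(
    "gs://mybucket/logs/events.jsonl",  # hypothetical path
    format="jsonl",
    nrows=1000,
)
```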
datachain/lib/dc/listings.py CHANGED
@@ -1,7 +1,4 @@
- from typing import (
- TYPE_CHECKING,
- Optional,
- )
+ from typing import TYPE_CHECKING

  from datachain.lib.listing import LISTING_PREFIX, ls
  from datachain.lib.listing_info import ListingInfo
@@ -56,7 +53,7 @@ class ReadOnlyQueryStep(QueryStep):


  def listings(
- session: Optional[Session] = None,
+ session: Session | None = None,
  in_memory: bool = False,
  column: str = "listing",
  **kwargs,
@@ -84,10 +81,10 @@ def listings(

  def read_listing_dataset(
  name: str,
- version: Optional[str] = None,
+ version: str | None = None,
  path: str = "",
- session: Optional["Session"] = None,
- settings: Optional[dict] = None,
+ session: Session | None = None,
+ settings: dict | None = None,
  ) -> tuple["DataChain", "DatasetVersion"]:
  """Read a listing dataset and return a DataChain and listing version.
datachain/lib/dc/pandas.py CHANGED
@@ -1,7 +1,4 @@
- from typing import (
- TYPE_CHECKING,
- Optional,
- )
+ from typing import TYPE_CHECKING

  from datachain.query import Session

@@ -19,8 +16,8 @@ if TYPE_CHECKING:
  def read_pandas( # type: ignore[override]
  df: "pd.DataFrame",
  name: str = "",
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
+ session: Session | None = None,
+ settings: dict | None = None,
  in_memory: bool = False,
  column: str = "",
  ) -> "DataChain":
datachain/lib/dc/parquet.py CHANGED
@@ -1,8 +1,5 @@
- from typing import (
- TYPE_CHECKING,
- Any,
- Optional,
- )
+ import os
+ from typing import TYPE_CHECKING, Any

  from datachain.lib.data_model import DataType
  from datachain.query import Session
@@ -16,28 +13,34 @@ if TYPE_CHECKING:


  def read_parquet(
- path,
+ path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
  partitioning: Any = "hive",
- output: Optional[dict[str, DataType]] = None,
+ output: dict[str, DataType] | None = None,
  column: str = "",
  model_name: str = "",
  source: bool = True,
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
+ session: Session | None = None,
+ settings: dict | None = None,
  **kwargs,
  ) -> "DataChain":
  """Generate chain from parquet files.

  Parameters:
- path : Storage URI with directory. URI must start with storage prefix such
- as `s3://`, `gs://`, `az://` or "file:///".
- partitioning : Any pyarrow partitioning schema.
- output : Dictionary defining column names and their corresponding types.
- column : Created column name.
- model_name : Generated model name.
- source : Whether to include info about the source file.
- session : Session to use for the chain.
- settings : Settings to use for the chain.
+ path: Storage path(s) or URI(s). Can be a local path or start with a
+ storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+ Supports glob patterns:
+ - `*` : wildcard
+ - `**` : recursive wildcard
+ - `?` : single character
+ - `{a,b}` : brace expansion list
+ - `{1..9}` : brace numeric or alphabetic range
+ partitioning: Any pyarrow partitioning schema.
+ output: Dictionary defining column names and their corresponding types.
+ column: Created column name.
+ model_name: Generated model name.
+ source: Whether to include info about the source file.
+ session: Session to use for the chain.
+ settings: Settings to use for the chain.

  Example:
  Reading a single file:
@@ -46,10 +49,19 @@ def read_parquet(
  dc.read_parquet("s3://mybucket/file.parquet")
  ```

- Reading a partitioned dataset from a directory:
+ All files from a directory:
  ```py
- import datachain as dc
- dc.read_parquet("s3://mybucket/dir")
+ dc.read_parquet("s3://mybucket/dir/")
+ ```
+
+ Only parquet files from a directory, and all it's subdirectories:
+ ```py
+ dc.read_parquet("s3://mybucket/dir/**/*.parquet")
+ ```
+
+ Using filename patterns - numeric, list, starting with zeros:
+ ```py
+ dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
  ```
  """
  from .storage import read_storage
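
Beyond the docstring examples above, a sketch combining the new glob support with an explicit `output` schema; the bucket layout and column names are hypothetical:

```py
import datachain as dc

# Brace ranges/lists select the 2021-2024 monthly files in one call;
# output maps column names to types for the columns of interest
# instead of relying on inference from the parquet metadata.
trips = dc.read_parquet(
    "s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet",
    output={"vendor_id": int, "fare": float},
    column="trip",
)
```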
datachain/lib/dc/records.py CHANGED
@@ -1,5 +1,5 @@
  from collections.abc import Iterable
- from typing import TYPE_CHECKING, Optional, Union
+ from typing import TYPE_CHECKING

  import sqlalchemy

@@ -19,20 +19,20 @@ READ_RECORDS_BATCH_SIZE = 10000


  def read_records(
- to_insert: Optional[Union[dict, Iterable[dict]]],
- session: Optional[Session] = None,
- settings: Optional[dict] = None,
+ to_insert: dict | Iterable[dict] | None,
+ session: Session | None = None,
+ settings: dict | None = None,
  in_memory: bool = False,
- schema: Optional[dict[str, DataType]] = None,
+ schema: dict[str, DataType] | None = None,
  ) -> "DataChain":
  """Create a DataChain from the provided records. This method can be used for
  programmatically generating a chain in contrast of reading data from storages
  or other sources.

  Parameters:
- to_insert : records (or a single record) to insert. Each record is
- a dictionary of signals and theirs values.
- schema : describes chain signals and their corresponding types
+ to_insert: records (or a single record) to insert. Each record is
+ a dictionary of signals and their values.
+ schema: describes chain signals and their corresponding types

  Example:
  ```py
@@ -45,7 +45,6 @@ def read_records(
  """
  from datachain.query.dataset import adjust_outputs, get_col_types
  from datachain.sql.types import SQLType
- from datachain.utils import batched

  from .datasets import read_dataset

@@ -79,8 +78,6 @@ def read_records(
  ),
  )

- session.add_dataset_version(dsr, dsr.latest_version)
-
  if isinstance(to_insert, dict):
  to_insert = [to_insert]
  elif not to_insert:
@@ -96,7 +93,6 @@ def read_records(
  {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
  )
  records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
- for chunk in batched(records, READ_RECORDS_BATCH_SIZE):
- warehouse.insert_rows(table, chunk)
+ warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
  warehouse.insert_rows_done(table)
  return read_dataset(name=dsr.full_name, session=session, settings=settings)
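
The last hunk routes batching through `warehouse.insert_rows` with `READ_RECORDS_BATCH_SIZE` instead of pre-chunking with `batched()`; callers of `read_records` are unaffected. A minimal sketch with synthetic records:

```py
import datachain as dc

# Each record is a plain dict of signal values; schema maps names to types.
records = [{"name": f"sample-{i}", "value": i} for i in range(25_000)]

# Rows are inserted in batches of READ_RECORDS_BATCH_SIZE (10000) under
# the hood; the public call is unchanged.
chain = dc.read_records(records, schema={"name": str, "value": int}, in_memory=True)
```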