deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
  import inspect
- from typing import Any, Dict, List
+ import functools
+ from typing import Any, Callable, Dict, List


  def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
@@ -42,3 +43,34 @@ def sanitize_kwargs_by_supported_kwargs(
              new_kwargs[key] = kwargs[key]

      return new_kwargs
+
+
+ def alias(aliases: Dict[str, str]) -> Callable:
+     """
+     This decorator allows aliases to be used for function arguments.
+
+     :param aliases: A dictionary mapping argument names to their aliases.
+     :return: A decorator that can be applied to a function.
+
+     For example:
+     >>> @alias({'long_parameter_name': 'param'})
+     ... def example_fn(long_parameter_name='foo', **kwargs):
+     ...     print(long_parameter_name)
+     >>> example_fn(long_parameter_name="bar")
+     bar
+     >>> example_fn(param="baz")
+     baz
+     >>> example_fn()
+     foo
+     """
+
+     def decorator(func: Callable) -> Callable:
+         @functools.wraps(func)
+         def wrapper(**kwargs: Any) -> Any:
+             for name, alias in aliases.items():
+                 if name not in kwargs and alias in kwargs:
+                     kwargs[name] = kwargs[alias]
+             return func(**kwargs)
+
+         return wrapper
+
+     return decorator
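
If this hunk belongs to deltacat/utils/arguments.py (the +33/-1 entry in the file list above), a minimal usage sketch of the new alias decorator could look like the following; create_table and its parameters are hypothetical and only the decorator itself comes from this diff:

    from deltacat.utils.arguments import alias

    @alias({"namespace_name": "namespace"})
    def create_table(*, namespace_name: str = "default", **kwargs):
        # The wrapper copies kwargs["namespace"] into kwargs["namespace_name"]
        # before calling the wrapped function, so both spellings work. Note the
        # wrapper only forwards keyword arguments, and the original alias key is
        # still passed through, so the target function must accept **kwargs.
        return namespace_name

    assert create_table(namespace_name="dev") == "dev"
    assert create_table(namespace="prod") == "prod"
    assert create_table() == "default"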
deltacat/utils/daft.py CHANGED
@@ -1,5 +1,5 @@
  import logging
- from typing import Optional, List, Any, Dict, Callable, Iterator
+ from typing import Optional, List, Any, Dict, Callable, Iterator, Union

  from daft.daft import (
      StorageConfig,
@@ -34,9 +34,10 @@ from daft.io.scan import (
      make_partition_field,
  )
  import pyarrow as pa
+ import pyarrow.fs as pafs
+ from fsspec import AbstractFileSystem

  from deltacat import logs
- from deltacat.catalog.model.table_definition import TableDefinition
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.schema import coerce_pyarrow_table_to_schema
  from deltacat.types.media import ContentType, ContentEncoding
@@ -44,14 +45,16 @@ from deltacat.aws.constants import (
      BOTO_MAX_RETRIES,
      DAFT_MAX_S3_CONNECTIONS_PER_FILE,
      AWS_REGION,
-     DEFAULT_FILE_READ_TIMEOUT_MS,
  )
+ from deltacat.constants import DEFAULT_FILE_READ_TIMEOUT_MS
  from deltacat.utils.performance import timed_invocation

  from deltacat.types.partial_download import (
      PartialFileDownloadParams,
  )
- from deltacat.storage import (
+
+ # Import directly from storage model modules to avoid circular import
+ from deltacat.storage.model.transform import (
      Transform,
      IdentityTransform,
      HourTransform,
@@ -61,9 +64,10 @@ from deltacat.storage import (
      BucketTransform,
      BucketingStrategy,
      TruncateTransform,
-     PartitionKey,
-     Schema,
+     TruncateStrategy,
  )
+ from deltacat.storage.model.partition import PartitionKey
+ from deltacat.storage.model.schema import Schema
  from deltacat.storage.model.interop import ModelMapper
  from deltacat.storage.model.expression import (
      Expression,
@@ -225,150 +229,13 @@ class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
          return Not.of(IsNull.of(self.visit(expr)))


- def s3_files_to_dataframe(
-     uris: List[str],
-     content_type: str,
-     content_encoding: str,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
-     s3_client_kwargs: Optional[Any] = None,
-     ray_init_options: Optional[Dict[str, Any]] = None,
- ) -> DataFrame:
-
-     if ray_init_options is None:
-         ray_init_options = {}
-
-     assert (
-         content_type == ContentType.PARQUET.value
-     ), f"daft native reader currently only supports parquet, got {content_type}"
-
-     assert (
-         content_encoding == ContentEncoding.IDENTITY.value
-     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
-
-     if not ray.is_initialized():
-         ray.init(ignore_reinit_error=True, **ray_init_options)
-
-     daft.context.set_runner_ray(noop_if_initialized=True)
-
-     if s3_client_kwargs is None:
-         s3_client_kwargs = {}
-
-     kwargs = {}
-     if read_func_kwargs_provider is not None:
-         kwargs = read_func_kwargs_provider(content_type, kwargs)
-
-     # TODO(raghumdani): pass in coerce_int96_timestamp arg
-     # https://github.com/Eventual-Inc/Daft/issues/1894
-
-     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-
-     logger.debug(
-         f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
-     )
-
-     df, latency = timed_invocation(daft.read_parquet, path=uris, io_config=io_config)
-
-     logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
-
-     columns_to_read = include_columns or column_names
-
-     logger.debug(f"Taking columns {columns_to_read} from the daft df.")
-
-     if columns_to_read:
-         return df.select(*columns_to_read)
-     else:
-         return df
-
-
- def daft_s3_file_to_table(
-     s3_url: str,
-     content_type: str,
-     content_encoding: str,
-     column_names: Optional[List[str]] = None,
-     include_columns: Optional[List[str]] = None,
-     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
-     **s3_client_kwargs,
- ):
-     assert (
-         content_type == ContentType.PARQUET.value
-     ), f"daft native reader currently only supports parquet, got {content_type}"
-
-     assert (
-         content_encoding == ContentEncoding.IDENTITY.value
-     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
-
-     kwargs = {}
-     if pa_read_func_kwargs_provider is not None:
-         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
-     coerce_int96_timestamp_unit = TimeUnit.from_str(
-         kwargs.get("coerce_int96_timestamp_unit", "ms")
-     )
-     file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
-
-     row_groups = None
-     if (
-         partial_file_download_params is not None
-         and partial_file_download_params.row_groups_to_download is not None
-     ):
-         row_groups = partial_file_download_params.row_groups_to_download
-
-     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-
-     logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
-
-     pa_table, latency = timed_invocation(
-         read_parquet_into_pyarrow,
-         path=s3_url,
-         columns=include_columns or column_names,
-         row_groups=row_groups,
-         io_config=io_config,
-         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
-         multithreaded_io=False,
-         file_timeout_ms=file_timeout_ms,
-     )
-
-     logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
-
-     if kwargs.get("schema") is not None:
-         input_schema = kwargs["schema"]
-         if include_columns is not None:
-             input_schema = pa.schema(
-                 [input_schema.field(col) for col in include_columns],
-                 metadata=input_schema.metadata,
-             )
-         elif column_names is not None:
-             input_schema = pa.schema(
-                 [input_schema.field(col) for col in column_names],
-                 metadata=input_schema.metadata,
-             )
-         return coerce_pyarrow_table_to_schema(pa_table, input_schema)
-     else:
-         return pa_table
-
-
- def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
-     return IOConfig(
-         s3=S3Config(
-             key_id=s3_client_kwargs.get("aws_access_key_id"),
-             access_key=s3_client_kwargs.get("aws_secret_access_key"),
-             session_token=s3_client_kwargs.get("aws_session_token"),
-             region_name=AWS_REGION,
-             retry_mode="adaptive",
-             num_tries=BOTO_MAX_RETRIES,
-             max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
-             connect_timeout_ms=5_000,  # Timeout to connect to server
-             read_timeout_ms=10_000,  # Timeout for first byte from server
-         )
-     )
-
-
  class DeltaCatScanOperator(ScanOperator):
-     def __init__(self, table: TableDefinition, storage_config: StorageConfig) -> None:
+     def __init__(self, table, storage_config: StorageConfig) -> None:
+         # Import inside method to avoid circular import
+         from deltacat.catalog.model.table_definition import TableDefinition
+
+         if not isinstance(table, TableDefinition):
+             raise TypeError("table must be a TableDefinition instance")
          super().__init__()
          self.table = table
          self._schema = self._infer_schema()
@@ -463,6 +330,158 @@ class DeltaCatScanOperator(ScanOperator):
          return partition_fields


+ def read_csv(
+     path: Union[str, List[str]],
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     content_type: Optional[str] = None,
+     **read_kwargs,
+ ) -> DataFrame:
+     """
+     Read a CSV file into a Daft DataFrame.
+
+     Args:
+         path: Path to the CSV file
+         filesystem: Optional filesystem to use
+         fs_open_kwargs: Optional filesystem open kwargs
+         content_encoding: Content encoding (IDENTITY or GZIP supported)
+         content_type: Optional content type (PARQUET, JSON, CSV, etc.)
+         **read_kwargs: Additional arguments passed to daft.read_csv
+
+     Returns:
+         DataFrame: The Daft DataFrame
+     """
+     logger.debug(
+         f"Reading CSV file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+     )
+
+     # If content_type is provided, add appropriate reader kwargs
+     if content_type is not None:
+         content_kwargs = content_type_to_reader_kwargs(content_type)
+         read_kwargs.update(content_kwargs)
+         logger.debug(f"Added content type kwargs for {content_type}: {content_kwargs}")
+
+     # Files should now be written with proper extensions, so we can read them directly
+     logger.debug(f"Reading CSV with Daft from: {path}")
+     df, latency = timed_invocation(daft.read_csv, path, **read_kwargs)
+
+     logger.debug(f"Time to read CSV {path} into Daft DataFrame: {latency}s")
+     return df
+
+
+ def read_json(
+     path: Union[str, List[str]],
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> DataFrame:
+     """
+     Read a JSON file into a Daft DataFrame.
+
+     Args:
+         path: Path to the JSON file (supports line-delimited JSON)
+         filesystem: Optional filesystem to use
+         fs_open_kwargs: Optional filesystem open kwargs
+         content_encoding: Content encoding (IDENTITY or GZIP supported)
+         **read_kwargs: Additional arguments passed to daft.read_json
+
+     Returns:
+         DataFrame: The Daft DataFrame
+     """
+     logger.debug(
+         f"Reading JSON file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+     )
+
+     # Files should now be written with proper extensions, so we can read them directly
+     logger.debug(f"Reading JSON with Daft from: {path}")
+     df, latency = timed_invocation(daft.read_json, path, **read_kwargs)
+
+     logger.debug(f"Time to read JSON {path} into Daft DataFrame: {latency}s")
+     return df
+
+
+ def read_parquet(
+     path: Union[str, List[str]],
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> DataFrame:
+     """
+     Read a Parquet file into a Daft DataFrame.
+
+     Args:
+         path: Path to the Parquet file
+         filesystem: Optional filesystem to use
+         fs_open_kwargs: Optional filesystem open kwargs
+         content_encoding: Content encoding (IDENTITY or GZIP supported)
+         **read_kwargs: Additional arguments passed to daft.read_parquet
+
+     Returns:
+         DataFrame: The Daft DataFrame
+     """
+     logger.debug(
+         f"Reading Parquet file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+     )
+     logger.debug(f"Reading Parquet with Daft from: {path}")
+     df, latency = timed_invocation(daft.read_parquet, path=path, **read_kwargs)
+     logger.debug(f"Time to read Parquet {path} into Daft DataFrame: {latency}s")
+     return df
+
+
+ # Map content types to their respective Daft read functions
+ CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+     ContentType.UNESCAPED_TSV.value: read_csv,
+     ContentType.TSV.value: read_csv,
+     ContentType.CSV.value: read_csv,
+     ContentType.PSV.value: read_csv,
+     ContentType.PARQUET.value: read_parquet,
+     ContentType.JSON.value: read_json,
+ }
+
+
+ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+     """
+     Returns reader kwargs for the given content type when reading with Daft.
+     """
+     if content_type == ContentType.UNESCAPED_TSV.value:
+         return {
+             "delimiter": "\t",
+             "has_headers": False,
+             "double_quote": False,
+             "allow_variable_columns": True,
+         }
+     if content_type == ContentType.TSV.value:
+         return {
+             "delimiter": "\t",
+             "has_headers": False,
+             "allow_variable_columns": True,
+         }
+     if content_type == ContentType.CSV.value:
+         return {
+             "delimiter": ",",
+             "has_headers": False,
+             "allow_variable_columns": True,
+         }
+     if content_type == ContentType.PSV.value:
+         return {
+             "delimiter": "|",
+             "has_headers": False,
+             "allow_variable_columns": True,
+         }
+     if content_type in {
+         ContentType.PARQUET.value,
+         ContentType.JSON.value,
+     }:
+         return {}
+     raise ValueError(f"Unsupported content type for Daft reader: {content_type}")
+
+
  class DaftFieldMapper(ModelMapper[DaftField, PaField]):
      @staticmethod
      def map(
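
As a rough sketch of how the content-type dispatch added above is meant to be used (the file path below is hypothetical; the mapping and the reader kwargs come from the code in this hunk):

    from deltacat.types.media import ContentType
    from deltacat.utils.daft import CONTENT_TYPE_TO_READ_FN, content_type_to_reader_kwargs

    content_type = ContentType.TSV.value
    read_fn = CONTENT_TYPE_TO_READ_FN[content_type]             # resolves to read_csv
    reader_kwargs = content_type_to_reader_kwargs(content_type)
    # {"delimiter": "\t", "has_headers": False, "allow_variable_columns": True}
    df = read_fn("data/part-0.tsv", **reader_kwargs)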
@@ -569,7 +588,12 @@ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
                  f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
              )
          elif isinstance(obj, TruncateTransform):
-             return DaftTransform.iceberg_truncate(obj.parameters.width)
+             if obj.parameters.truncate_strategy == TruncateStrategy.ICEBERG:
+                 return DaftTransform.iceberg_truncate(obj.parameters.width)
+             else:
+                 raise ValueError(
+                     f"Unsupported Truncate Strategy: {obj.parameters.truncate_strategy}"
+                 )

          raise ValueError(f"Unsupported Transform: {obj}")

@@ -693,3 +717,240 @@ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
              name=partition_field_name,
              dtype=result_type,
          )
+
+
+ def files_to_dataframe(
+     uris: List[str],
+     content_type: str,
+     content_encoding: str,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+     ray_init_options: Optional[Dict[str, Any]] = None,
+     **kwargs,
+ ) -> DataFrame:
+     """
+     Read multiple files into a Daft DataFrame using any filesystem.
+
+     This function supports reading PARQUET, CSV, JSON, TSV, and PSV files.
+
+     Args:
+         uris: List of file URIs to read
+         content_type: The content type (PARQUET, CSV, JSON, TSV, UNESCAPED_TSV, PSV)
+         content_encoding: The content encoding (IDENTITY and GZIP are supported)
+         column_names: Optional column names to assign
+         include_columns: Optional columns to include in the result
+         read_func_kwargs_provider: Optional kwargs provider for customization
+         ray_options_provider: Optional Ray options provider
+         ray_init_options: Optional Ray initialization options
+         **kwargs: Additional kwargs, including optional 'io_config' for filesystem configuration
+
+     Returns:
+         DataFrame: The Daft DataFrame
+
+     Raises:
+         NotImplementedError: If content_type is not supported or content_encoding is not IDENTITY or GZIP
+
+     Examples:
+         # Read local parquet files (filesystem auto-inferred)
+         df = files_to_dataframe(
+             uris=["file1.parquet", "file2.parquet"],
+             content_type=ContentType.PARQUET.value,
+             content_encoding=ContentEncoding.IDENTITY.value
+         )
+
+         # Read CSV files
+         df = files_to_dataframe(
+             uris=["file1.csv", "file2.csv"],
+             content_type=ContentType.CSV.value,
+             content_encoding=ContentEncoding.IDENTITY.value
+         )
+
+         # Read S3 files with custom IOConfig
+         from daft.io import IOConfig, S3Config
+         s3_config = IOConfig(s3=S3Config(...))
+         df = files_to_dataframe(
+             uris=["s3://bucket/file1.parquet", "s3://bucket/file2.parquet"],
+             content_type=ContentType.PARQUET.value,
+             content_encoding=ContentEncoding.IDENTITY.value,
+             io_config=s3_config
+         )
+     """
+     if ray_init_options is None:
+         ray_init_options = {}
+
+     if content_type not in CONTENT_TYPE_TO_READ_FN.keys():
+         raise NotImplementedError(
+             f"Daft native reader supports {CONTENT_TYPE_TO_READ_FN.keys()}, got {content_type}. "
+             f"Try using the Ray Dataset reader instead."
+         )
+
+     # Handle content encoding - for now, we only support identity and gzip
+     if content_encoding not in [
+         ContentEncoding.IDENTITY.value,
+         ContentEncoding.GZIP.value,
+     ]:
+         raise NotImplementedError(
+             f"Daft native reader currently supports identity and gzip encoding, got {content_encoding}"
+         )
+
+     if not ray.is_initialized():
+         ray.init(**ray_init_options)
+
+     daft.context.set_runner_ray(noop_if_initialized=True)
+
+     read_kwargs = {}
+     if read_func_kwargs_provider is not None:
+         read_kwargs = read_func_kwargs_provider(content_type, read_kwargs)
+
+     # Add content-type-specific reader kwargs
+     content_type_kwargs = content_type_to_reader_kwargs(content_type)
+     read_kwargs.update(content_type_kwargs)
+
+     # Extract io_config from kwargs if provided, otherwise use None
+     io_config = kwargs.pop("io_config", None)
+
+     # Merge any remaining kwargs into read_kwargs (including file_path_column for native Daft support)
+     read_kwargs.update(kwargs)
+
+     logger.debug(f"Preparing to read {len(uris)} files into daft dataframe")
+     logger.debug(f"Content type: {content_type}")
+     logger.debug(f"Final read_kwargs: {read_kwargs}")
+
+     # Get the appropriate Daft reader function based on content type
+     daft_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+     if not daft_read_func:
+         raise NotImplementedError(
+             f"Daft reader for content type '{content_type}' not implemented. "
+             f"Known content types: {list(CONTENT_TYPE_TO_READ_FN.keys())}"
+         )
+
+     # Handle schema for all supported formats
+     table_version_schema = kwargs.get("table_version_schema")
+     if table_version_schema is not None:
+         # Convert PyArrow schema to Daft schema using the official API
+         daft_schema = daft.Schema.from_pyarrow_schema(table_version_schema)
+         # Convert DaftSchema to dictionary format required by Daft readers
+         schema_dict = {field.name: field.dtype for field in daft_schema}
+         # Remove table_version_schema from kwargs since Daft readers don't recognize it
+         read_kwargs.pop("table_version_schema", None)
+         # Use explicit schema with infer_schema=False for correctness and performance
+         read_kwargs.update({"infer_schema": False, "schema": schema_dict})
+     else:
+         # Remove table_version_schema parameter if present but None
+         read_kwargs.pop("table_version_schema", None)
+
+     logger.debug(f"Reading {len(uris)} files with Daft using {daft_read_func}.")
+
+     # Call the appropriate Daft reader function
+     if io_config is not None and content_type == ContentType.PARQUET.value:
+         # Only the parquet reader supports the io_config parameter
+         df, latency = timed_invocation(
+             daft_read_func, path=uris, io_config=io_config, **read_kwargs
+         )
+     else:
+         df, latency = timed_invocation(daft_read_func, path=uris, **read_kwargs)
+
+     logger.debug(f"Daft read {len(uris)} files in {latency}s.")
+
+     # Apply column selection after reading
+     columns_to_read = include_columns or column_names
+     file_path_column = read_kwargs.get("file_path_column")
+     if file_path_column and columns_to_read and file_path_column not in columns_to_read:
+         # Add file_path_column to selection if it was specified
+         columns_to_read.append(file_path_column)
+
+     if columns_to_read:
+         logger.debug(f"Selecting columns {columns_to_read} with Daft.")
+         return df.select(*columns_to_read)
+     else:
+         return df
+
+
+ def daft_file_to_pyarrow_table(
+     path: str,
+     content_type: str,
+     content_encoding: str,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+     **kwargs,
+ ) -> pa.Table:
+     assert (
+         content_type == ContentType.PARQUET.value
+     ), f"daft native reader currently only supports parquet, got {content_type}"
+
+     assert (
+         content_encoding == ContentEncoding.IDENTITY.value
+     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+     kwargs = {}
+     if pa_read_func_kwargs_provider is not None:
+         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+     coerce_int96_timestamp_unit = TimeUnit.from_str(
+         kwargs.get("coerce_int96_timestamp_unit", "ms")
+     )
+     file_timeout_ms = kwargs.get("file_timeout_ms", DEFAULT_FILE_READ_TIMEOUT_MS)
+
+     row_groups = None
+     if (
+         partial_file_download_params is not None
+         and partial_file_download_params.row_groups_to_download is not None
+     ):
+         row_groups = partial_file_download_params.row_groups_to_download
+
+     # Extract io_config from kwargs if provided
+     io_config = kwargs.pop("io_config", None)
+     if not io_config and path.startswith("s3://"):
+         io_config = _get_s3_io_config(kwargs)
+
+     logger.debug(f"Preparing to read object from {path} into daft table")
+
+     pa_table, latency = timed_invocation(
+         read_parquet_into_pyarrow,
+         path=path,
+         columns=include_columns or column_names,
+         row_groups=row_groups,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+         multithreaded_io=False,
+         file_timeout_ms=file_timeout_ms,
+     )
+
+     logger.debug(f"Time to read object from {path} into daft table: {latency}s")
+
+     if kwargs.get("schema") is not None:
+         input_schema = kwargs["schema"]
+         if include_columns is not None:
+             input_schema = pa.schema(
+                 [input_schema.field(col) for col in include_columns],
+                 metadata=input_schema.metadata,
+             )
+         elif column_names is not None:
+             input_schema = pa.schema(
+                 [input_schema.field(col) for col in column_names],
+                 metadata=input_schema.metadata,
+             )
+         return coerce_pyarrow_table_to_schema(pa_table, input_schema)
+     else:
+         return pa_table
+
+
+ def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
+     return IOConfig(
+         s3=S3Config(
+             key_id=s3_client_kwargs.get("aws_access_key_id"),
+             access_key=s3_client_kwargs.get("aws_secret_access_key"),
+             session_token=s3_client_kwargs.get("aws_session_token"),
+             region_name=AWS_REGION,
+             retry_mode="adaptive",
+             num_tries=BOTO_MAX_RETRIES,
+             max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
+             connect_timeout_ms=5_000,  # Timeout to connect to server
+             read_timeout_ms=10_000,  # Timeout for first byte from server
+         )
+     )
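
As a usage sketch of the new files_to_dataframe entry point, mirroring the Examples section of its docstring (the local file paths and column names below are made up for illustration):

    from deltacat.types.media import ContentType, ContentEncoding
    from deltacat.utils.daft import files_to_dataframe

    # Read two local Parquet files and keep only two columns; the filesystem
    # is inferred from the paths, and Ray/Daft are initialized by the function.
    df = files_to_dataframe(
        uris=["/tmp/part-0.parquet", "/tmp/part-1.parquet"],
        content_type=ContentType.PARQUET.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        include_columns=["user_id", "event_time"],
    )
    df.show()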