deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
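Note on the {storage → experimental/storage} and {catalog → experimental/catalog} renames above: the Rivulet and Iceberg modules now live under the deltacat.experimental package, so imports written against the 2.0.0b10 paths need to be updated. A minimal sketch of the change, assuming the module-level symbols themselves are unchanged (the Dataset name below is illustrative):

    # deltacat 2.0.0b10
    from deltacat.storage.rivulet.dataset import Dataset
    # deltacat 2.0.0b12 - same module, relocated under the experimental namespace
    from deltacat.experimental.storage.rivulet.dataset import Dataset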
deltacat/utils/daft.py CHANGED
@@ -1,11 +1,41 @@
  import logging
- from typing import Optional, List, Any, Dict, Callable
+ from typing import Optional, List, Any, Dict, Callable, Iterator, Union
+
+ from daft.daft import (
+ StorageConfig,
+ PartitionField,
+ Pushdowns as DaftRustPushdowns,
+ ScanTask,
+ FileFormatConfig,
+ ParquetSourceConfig,
+ PartitionTransform as DaftTransform,
+ PartitionField as DaftPartitionField,
+ )
+ from daft.expressions import Expression as DaftExpression
+ from daft.expressions.visitor import PredicateVisitor
+ from pyarrow import Field as PaField
+
  import daft
  import ray
- from daft import TimeUnit, DataFrame
+ from daft import (
+ TimeUnit,
+ DataFrame,
+ Schema as DaftSchema,
+ DataType,
+ )
+ from daft.logical.schema import Field as DaftField
  from daft.recordbatch import read_parquet_into_pyarrow
- from daft.io import IOConfig, S3Config
+ from daft.io import (
+ IOConfig,
+ S3Config,
+ )
+ from daft.io.scan import (
+ ScanOperator,
+ make_partition_field,
+ )
  import pyarrow as pa
+ import pyarrow.fs as pafs
+ from fsspec import AbstractFileSystem

  from deltacat import logs
  from deltacat.utils.common import ReadKwargsProvider
@@ -15,19 +45,681 @@ from deltacat.aws.constants import (
  BOTO_MAX_RETRIES,
  DAFT_MAX_S3_CONNECTIONS_PER_FILE,
  AWS_REGION,
- DEFAULT_FILE_READ_TIMEOUT_MS,
  )
+ from deltacat.constants import DEFAULT_FILE_READ_TIMEOUT_MS
  from deltacat.utils.performance import timed_invocation

  from deltacat.types.partial_download import (
  PartialFileDownloadParams,
  )

+ # Import directly from storage model modules to avoid circular import
+ from deltacat.storage.model.transform import (
+ Transform,
+ IdentityTransform,
+ HourTransform,
+ DayTransform,
+ MonthTransform,
+ YearTransform,
+ BucketTransform,
+ BucketingStrategy,
+ TruncateTransform,
+ TruncateStrategy,
+ )
+ from deltacat.storage.model.partition import PartitionKey
+ from deltacat.storage.model.schema import Schema
+ from deltacat.storage.model.interop import ModelMapper
+ from deltacat.storage.model.expression import (
+ Expression,
+ Reference,
+ Literal,
+ Equal,
+ NotEqual,
+ GreaterThan,
+ LessThan,
+ GreaterThanEqual,
+ LessThanEqual,
+ And,
+ Or,
+ Not,
+ IsNull,
+ )
+ from deltacat.storage.model.scan.push_down import (
+ PartitionFilter,
+ Pushdown as DeltaCatPushdown,
+ )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- def s3_files_to_dataframe(
+ def translate_pushdown(pushdown: DaftRustPushdowns) -> DeltaCatPushdown:
+ """
+ Helper method to translate a Daft Pushdowns object into a Deltacat Pushdown.
+ Args:
+ pushdown: Daft Daft Pushdowns object
+ Returns:
+ Pushdown: Deltacat Pushdown object with translated filters
+ """
+ translator = DaftToDeltacatVisitor()
+
+ partition_filters = None
+ if pushdown.partition_filters is not None:
+ daft_expr = DaftExpression._from_pyexpr(pushdown.partition_filters)
+ partition_filters = PartitionFilter.of(translator.visit(daft_expr))
+
+ filters = None
+ if pushdown.filters is not None:
+ daft_expr = DaftExpression._from_pyexpr(pushdown.filters)
+ # TODO: support deltacat row filters
+ # filters = RowFilter.of(translator.visit(daft_expr))
+
+ columns = None
+ limit = None
+
+ return DeltaCatPushdown.of(
+ partition_filter=partition_filters,
+ column_filter=columns,
+ row_filter=filters,
+ limit=limit,
+ )
+
+
+ class DaftToDeltacatVisitor(PredicateVisitor[Expression]):
+ """PredicateVisitor implementation to translate Daft Expressions into Deltacat Expressions"""
+
+ def visit_col(self, name: str) -> Expression:
+ return Reference.of(name)
+
+ def visit_lit(self, value: Any) -> Expression:
+ return Literal.of(value)
+
+ def visit_cast(self, expr: DaftExpression, dtype: DataType) -> Expression:
+ # deltacat expressions do not support explicit casting
+ # pyarrow should handle any type casting
+ return self.visit(expr)
+
+ def visit_alias(self, expr: DaftExpression, alias: str) -> Expression:
+ return self.visit(expr)
+
+ def visit_function(self, name: str, args: List[DaftExpression]) -> Expression:
+ # TODO: Add Deltacat expression function support
+ raise ValueError("Function not supported")
+
+ def visit_and(self, left: DaftExpression, right: DaftExpression) -> Expression:
+ """Visit an 'and' expression."""
+ return And.of(self.visit(left), self.visit(right))
+
+ def visit_or(self, left: DaftExpression, right: DaftExpression) -> Expression:
+ """Visit an 'or' expression."""
+ return Or.of(self.visit(left), self.visit(right))
+
+ def visit_not(self, expr: DaftExpression) -> Expression:
+ """Visit a 'not' expression."""
+ return Not.of(self.visit(expr))
+
+ def visit_equal(self, left: DaftExpression, right: DaftExpression) -> Expression:
+ """Visit an 'equals' comparison predicate."""
+ return Equal.of(self.visit(left), self.visit(right))
+
+ def visit_not_equal(
+ self, left: DaftExpression, right: DaftExpression
+ ) -> Expression:
+ """Visit a 'not equals' comparison predicate."""
+ return NotEqual.of(self.visit(left), self.visit(right))
+
+ def visit_less_than(
+ self, left: DaftExpression, right: DaftExpression
+ ) -> Expression:
+ """Visit a 'less than' comparison predicate."""
+ return LessThan.of(self.visit(left), self.visit(right))
+
+ def visit_less_than_or_equal(
+ self, left: DaftExpression, right: DaftExpression
+ ) -> Expression:
+ """Visit a 'less than or equal' comparison predicate."""
+ return LessThanEqual.of(self.visit(left), self.visit(right))
+
+ def visit_greater_than(
+ self, left: DaftExpression, right: DaftExpression
+ ) -> Expression:
+ """Visit a 'greater than' comparison predicate."""
+ return GreaterThan.of(self.visit(left), self.visit(right))
+
+ def visit_greater_than_or_equal(
+ self, left: DaftExpression, right: DaftExpression
+ ) -> Expression:
+ """Visit a 'greater than or equal' comparison predicate."""
+ return GreaterThanEqual.of(self.visit(left), self.visit(right))
+
+ def visit_between(
+ self, expr: DaftExpression, lower: DaftExpression, upper: DaftExpression
+ ) -> Expression:
+ """Visit a 'between' predicate."""
+ # Implement BETWEEN as lower <= expr <= upper
+ lower_bound = LessThanEqual.of(self.visit(lower), self.visit(expr))
+ upper_bound = LessThanEqual.of(self.visit(expr), self.visit(upper))
+ return And.of(lower_bound, upper_bound)
+
+ def visit_is_in(
+ self, expr: DaftExpression, items: list[DaftExpression]
+ ) -> Expression:
+ """Visit an 'is_in' predicate."""
+ # For empty list, return false literal
+ if not items:
+ return Literal(pa.scalar(False))
+
+ # Implement IN as a series of equality checks combined with OR
+ visited_expr = self.visit(expr)
+ equals_exprs = [Equal.of(visited_expr, self.visit(item)) for item in items]
+
+ # Combine with OR
+ result = equals_exprs[0]
+ for eq_expr in equals_exprs[1:]:
+ result = Or.of(result, eq_expr)
+
+ return result
+
+ def visit_is_null(self, expr: DaftExpression) -> Expression:
+ """Visit an 'is_null' predicate."""
+ return IsNull.of(self.visit(expr))
+
+ def visit_not_null(self, expr: DaftExpression) -> Expression:
+ """Visit an 'not_null' predicate."""
+ # NOT NULL is implemented as NOT(IS NULL)
+ return Not.of(IsNull.of(self.visit(expr)))
+
+
+ class DeltaCatScanOperator(ScanOperator):
+ def __init__(self, table, storage_config: StorageConfig) -> None:
+ # Import inside method to avoid circular import
+ from deltacat.catalog.model.table_definition import TableDefinition
+
+ if not isinstance(table, TableDefinition):
+ raise TypeError("table must be a TableDefinition instance")
+ super().__init__()
+ self.table = table
+ self._schema = self._infer_schema()
+ self.partition_keys = self._infer_partition_keys()
+ self.storage_config = storage_config
+
+ def schema(self) -> DaftSchema:
+ return self._schema
+
+ def name(self) -> str:
+ return "DeltaCatScanOperator"
+
+ def display_name(self) -> str:
+ return f"DeltaCATScanOperator({self.table.table.namespace}.{self.table.table.table_name})"
+
+ def partitioning_keys(self) -> list[PartitionField]:
+ return self.partition_keys
+
+ def multiline_display(self) -> list[str]:
+ return [
+ self.display_name(),
+ f"Schema = {self._schema}",
+ f"Partitioning keys = {self.partitioning_keys}",
+ f"Storage config = {self.storage_config}",
+ ]
+
+ def to_scan_tasks(self, pushdowns: DaftRustPushdowns) -> Iterator[ScanTask]:
+ dc_pushdown = translate_pushdown(pushdowns)
+ dc_scan_plan = self.table.create_scan_plan(pushdown=dc_pushdown)
+ scan_tasks = []
+ file_format_config = FileFormatConfig.from_parquet_config(
+ # maybe this: ParquetSourceConfig(field_id_mapping=self._field_id_mapping)
+ ParquetSourceConfig()
+ )
+ for dc_scan_task in dc_scan_plan.scan_tasks:
+ for data_file in dc_scan_task.data_files():
+ st = ScanTask.catalog_scan_task(
+ file=data_file.file_path,
+ file_format=file_format_config,
+ schema=self._schema._schema,
+ storage_config=self.storage_config,
+ pushdowns=pushdowns,
+ )
+ scan_tasks.append(st)
+ return iter(scan_tasks)
+
+ def can_absorb_filter(self) -> bool:
+ return False
+
+ def can_absorb_limit(self) -> bool:
+ return False
+
+ def can_absorb_select(self) -> bool:
+ return True
+
+ def _infer_schema(self) -> DaftSchema:
+
+ if not (
+ self.table and self.table.table_version and self.table.table_version.schema
+ ):
+ raise RuntimeError(
+ f"Failed to infer schema for DeltaCAT Table "
+ f"{self.table.table.namespace}.{self.table.table.table_name}"
+ )
+
+ return DaftSchema.from_pyarrow_schema(self.table.table_version.schema.arrow)
+
+ def _infer_partition_keys(self) -> list[PartitionField]:
+ if not (
+ self.table
+ and self.table.table_version
+ and self.table.table_version.partition_scheme
+ and self.table.table_version.schema
+ ):
+ raise RuntimeError(
+ f"Failed to infer partition keys for DeltaCAT Table "
+ f"{self.table.table.namespace}.{self.table.table.table_name}"
+ )
+
+ schema = self.table.table_version.schema
+ partition_keys = self.table.table_version.partition_scheme.keys
+ if not partition_keys:
+ return []
+
+ partition_fields = []
+ for key in partition_keys:
+ field = DaftPartitionKeyMapper.unmap(key, schema)
+ # Assert that the returned value is not None.
+ assert field is not None, f"Unmapping failed for key {key}"
+ partition_fields.append(field)
+
+ return partition_fields
+
+
+ def read_csv(
+ path: Union[str, List[str]],
+ *,
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+ fs_open_kwargs: Dict[str, Any] = {},
+ content_encoding: str = ContentEncoding.IDENTITY.value,
+ content_type: Optional[str] = None,
+ **read_kwargs,
+ ) -> DataFrame:
+ """
+ Read a CSV file into a Daft DataFrame.
+
+ Args:
+ path: Path to the CSV file
+ filesystem: Optional filesystem to use
+ fs_open_kwargs: Optional filesystem open kwargs
+ content_encoding: Content encoding (IDENTITY or GZIP supported)
+ content_type: Optional content type (PARQUET, JSON, CSV, etc.)
+ **read_kwargs: Additional arguments passed to daft.read_csv
+
+ Returns:
+ DataFrame: The Daft DataFrame
+ """
+ logger.debug(
+ f"Reading CSV file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+ )
+
+ # If content_type is provided, add appropriate reader kwargs
+ if content_type is not None:
+ content_kwargs = content_type_to_reader_kwargs(content_type)
+ read_kwargs.update(content_kwargs)
+ logger.debug(f"Added content type kwargs for {content_type}: {content_kwargs}")
+
+ # Files should now be written with proper extensions, so we can read them directly
+ logger.debug(f"Reading CSV with Daft from: {path}")
+ df, latency = timed_invocation(daft.read_csv, path, **read_kwargs)
+
+ logger.debug(f"Time to read CSV {path} into Daft DataFrame: {latency}s")
+ return df
+
+
+ def read_json(
+ path: Union[str, List[str]],
+ *,
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+ fs_open_kwargs: Dict[str, Any] = {},
+ content_encoding: str = ContentEncoding.IDENTITY.value,
+ **read_kwargs,
+ ) -> DataFrame:
+ """
+ Read a JSON file into a Daft DataFrame.
+
+ Args:
+ path: Path to the JSON file (supports line-delimited JSON)
+ filesystem: Optional filesystem to use
+ fs_open_kwargs: Optional filesystem open kwargs
+ content_encoding: Content encoding (IDENTITY or GZIP supported)
+ **read_kwargs: Additional arguments passed to daft.read_json
+
+ Returns:
+ DataFrame: The Daft DataFrame
+ """
+ logger.debug(
+ f"Reading JSON file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+ )
+
+ # Files should now be written with proper extensions, so we can read them directly
+ logger.debug(f"Reading JSON with Daft from: {path}")
+ df, latency = timed_invocation(daft.read_json, path, **read_kwargs)
+
+ logger.debug(f"Time to read JSON {path} into Daft DataFrame: {latency}s")
+ return df
+
+
+ def read_parquet(
+ path: Union[str, List[str]],
+ *,
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+ fs_open_kwargs: Dict[str, Any] = {},
+ content_encoding: str = ContentEncoding.IDENTITY.value,
+ **read_kwargs,
+ ) -> DataFrame:
+ """
+ Read a Parquet file into a Daft DataFrame.
+
+ Args:
+ path: Path to the Parquet file
+ filesystem: Optional filesystem to use
+ fs_open_kwargs: Optional filesystem open kwargs
+ content_encoding: Content encoding (IDENTITY or GZIP supported)
+ **read_kwargs: Additional arguments passed to daft.read_parquet
+
+ Returns:
+ DataFrame: The Daft DataFrame
+ """
+ logger.debug(
+ f"Reading Parquet file {path} into Daft DataFrame with kwargs: {read_kwargs}"
+ )
+ logger.debug(f"Reading Parquet with Daft from: {path}")
+ df, latency = timed_invocation(daft.read_parquet, path=path, **read_kwargs)
+ logger.debug(f"Time to read Parquet {path} into Daft DataFrame: {latency}s")
+ return df
+
+
+ # Map content types to their respective Daft read functions
+ CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+ ContentType.UNESCAPED_TSV.value: read_csv,
+ ContentType.TSV.value: read_csv,
+ ContentType.CSV.value: read_csv,
+ ContentType.PSV.value: read_csv,
+ ContentType.PARQUET.value: read_parquet,
+ ContentType.JSON.value: read_json,
+ }
+
+
+ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+ """
+ Returns reader kwargs for the given content type when reading with Daft.
+ """
+ if content_type == ContentType.UNESCAPED_TSV.value:
+ return {
+ "delimiter": "\t",
+ "has_headers": False,
+ "double_quote": False,
+ "allow_variable_columns": True,
+ }
+ if content_type == ContentType.TSV.value:
+ return {
+ "delimiter": "\t",
+ "has_headers": False,
+ "allow_variable_columns": True,
+ }
+ if content_type == ContentType.CSV.value:
+ return {
+ "delimiter": ",",
+ "has_headers": False,
+ "allow_variable_columns": True,
+ }
+ if content_type == ContentType.PSV.value:
+ return {
+ "delimiter": "|",
+ "has_headers": False,
+ "allow_variable_columns": True,
+ }
+ if content_type in {
+ ContentType.PARQUET.value,
+ ContentType.JSON.value,
+ }:
+ return {}
+ raise ValueError(f"Unsupported content type for Daft reader: {content_type}")
+
+
+ class DaftFieldMapper(ModelMapper[DaftField, PaField]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftField],
+ **kwargs,
+ ) -> Optional[PaField]:
+ """Convert Daft Field to PyArrow Field.
+
+ Args:
+ obj: The Daft Field to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted PyArrow Field object
+ """
+ if obj is None:
+ return None
+
+ return pa.field(
+ name=obj.name,
+ type=obj.dtype.to_arrow_dtype(),
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[PaField],
+ **kwargs,
+ ) -> Optional[DaftField]:
+ """Convert PyArrow Field to Daft Field.
+
+ Args:
+ obj: The PyArrow Field to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted Daft Field object
+ """
+ if obj is None:
+ return None
+
+ return DaftField.create(
+ name=obj.name,
+ dtype=DataType.from_arrow_type(obj.type), # type: ignore
+ )
+
+
+ class DaftTransformMapper(ModelMapper[DaftTransform, Transform]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftTransform],
+ **kwargs,
+ ) -> Optional[Transform]:
+ """Convert DaftTransform to DeltaCAT Transform.
+
+ Args:
+ obj: The DaftTransform to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted Transform object
+ """
+
+ # daft.PartitionTransform doesn't have a Python interface for accessing its attributes,
+ # thus conversion is not possible.
+ # TODO: request Daft to expose Python friendly interface for daft.PartitionTransform
+ raise NotImplementedError(
+ "Converting transform from Daft to DeltaCAT is not supported"
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[Transform],
+ **kwargs,
+ ) -> Optional[DaftTransform]:
+ """Convert DeltaCAT Transform to DaftTransform.
+
+ Args:
+ obj: The Transform to convert
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted DaftTransform object
+ """
+ if obj is None:
+ return None
+
+ # Map DeltaCAT transforms to Daft transforms using isinstance
+
+ if isinstance(obj, IdentityTransform):
+ return DaftTransform.identity()
+ elif isinstance(obj, HourTransform):
+ return DaftTransform.hour()
+ elif isinstance(obj, DayTransform):
+ return DaftTransform.day()
+ elif isinstance(obj, MonthTransform):
+ return DaftTransform.month()
+ elif isinstance(obj, YearTransform):
+ return DaftTransform.year()
+ elif isinstance(obj, BucketTransform):
+ if obj.parameters.bucketing_strategy == BucketingStrategy.ICEBERG:
+ return DaftTransform.iceberg_bucket(obj.parameters.num_buckets)
+ else:
+ raise ValueError(
+ f"Unsupported Bucketing Strategy: {obj.parameters.bucketing_strategy}"
+ )
+ elif isinstance(obj, TruncateTransform):
+ if obj.parameters.truncate_strategy == TruncateStrategy.ICEBERG:
+ return DaftTransform.iceberg_truncate(obj.parameters.width)
+ else:
+ raise ValueError(
+ f"Unsupported Truncate Strategy: {obj.parameters.truncate_strategy}"
+ )
+
+ raise ValueError(f"Unsupported Transform: {obj}")
+
+
+ class DaftPartitionKeyMapper(ModelMapper[DaftPartitionField, PartitionKey]):
+ @staticmethod
+ def map(
+ obj: Optional[DaftPartitionField],
+ schema: Optional[DaftSchema] = None,
+ **kwargs,
+ ) -> Optional[PartitionKey]:
+ """Convert DaftPartitionField to PartitionKey.
+
+ Args:
+ obj: The DaftPartitionField to convert
+ schema: The Daft schema containing field information
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted PartitionKey object
+ """
+ # Daft PartitionField only exposes 1 attribute `field` which is not enough
+ # to convert to DeltaCAT PartitionKey
+ # TODO: request Daft to expose more Python friendly interface for PartitionField
+ raise NotImplementedError(
+ f"Converting Daft PartitionField to DeltaCAT PartitionKey is not supported"
+ )
+
+ @staticmethod
+ def unmap(
+ obj: Optional[PartitionKey],
+ schema: Optional[Schema] = None,
+ **kwargs,
+ ) -> Optional[DaftPartitionField]:
+ """Convert PartitionKey to DaftPartitionField.
+
+ Args:
+ obj: The DeltaCAT PartitionKey to convert
+ schema: The Schema containing field information
+ **kwargs: Additional arguments
+
+ Returns:
+ Converted DaftPartitionField object
+ """
+ if obj is None:
+ return None
+ if obj.name is None:
+ raise ValueError("Name is required for PartitionKey conversion")
+ if not schema:
+ raise ValueError("Schema is required for PartitionKey conversion")
+ if len(obj.key) < 1:
+ raise ValueError(
+ f"At least 1 PartitionKey FieldLocator is expected, instead got {len(obj.key)}. FieldLocators: {obj.key}."
+ )
+
+ # Get the source field from schema - FieldLocator in PartitionKey.key points to the source field of partition field
+ dc_source_field = schema.field(obj.key[0]).arrow
+ daft_source_field = DaftFieldMapper.unmap(obj=dc_source_field)
+ # Convert transform if present
+ daft_transform = DaftTransformMapper.unmap(obj.transform)
+ daft_partition_field = DaftPartitionKeyMapper.get_daft_partition_field(
+ partition_field_name=obj.name,
+ daft_source_field=daft_source_field,
+ dc_transform=obj.transform,
+ )
+
+ # Create DaftPartitionField
+ return make_partition_field(
+ field=daft_partition_field,
+ source_field=daft_source_field,
+ transform=daft_transform,
+ )
+
+ @staticmethod
+ def get_daft_partition_field(
+ partition_field_name: str,
+ daft_source_field: Optional[DaftField],
+ # TODO: replace DeltaCAT transform with Daft Transform for uniformality
+ # We cannot use Daft Transform here because Daft Transform doesn't have a Python interface for us to
+ # access its attributes.
+ # TODO: request Daft to provide a more python friendly interface for Daft Tranform
+ dc_transform: Optional[Transform],
+ ) -> DaftField:
+ """Generate Daft Partition Field given partition field name, source field and transform.
+ Partition field type is inferred using source field type and transform.
+
+ Args:
+ partition_field_name (str): the specified result field name
+ daft_source_field (DaftField): the source field of the partition field
+ daft_transform (DaftTransform): transform applied on the source field to create partition field
+
+ Returns:
+ DaftField: Daft Field representing the partition field
+ """
+ if daft_source_field is None:
+ raise ValueError("Source field is required for PartitionField conversion")
+ if dc_transform is None:
+ raise ValueError("Transform is required for PartitionField conversion")
+
+ result_type = None
+ # Below type conversion logic references Daft - Iceberg conversion logic:
+ # https://github.com/Eventual-Inc/Daft/blob/7f2e9b5fb50fdfe858be17572f132b37dd6e5ab2/daft/iceberg/iceberg_scan.py#L61-L85
+ if isinstance(dc_transform, IdentityTransform):
+ result_type = daft_source_field.dtype
+ elif isinstance(dc_transform, YearTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, MonthTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, DayTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, HourTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, BucketTransform):
+ result_type = DataType.int32()
+ elif isinstance(dc_transform, TruncateTransform):
+ result_type = daft_source_field.dtype
+ else:
+ raise ValueError(f"Unsupported transform: {dc_transform}")
+
+ return DaftField.create(
+ name=partition_field_name,
+ dtype=result_type,
+ )
+
+
+ def files_to_dataframe(
  uris: List[str],
  content_type: str,
  content_encoding: str,
@@ -35,66 +727,158 @@ def s3_files_to_dataframe(
  include_columns: Optional[List[str]] = None,
  read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
  ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
- s3_client_kwargs: Optional[Any] = None,
  ray_init_options: Optional[Dict[str, Any]] = None,
+ **kwargs,
  ) -> DataFrame:
+ """
+ Read multiple files into a Daft DataFrame using any filesystem.
+
+ This function supports reading PARQUET, CSV, JSON, TSV, and PSV files.
+
+ Args:
+ uris: List of file URIs to read
+ content_type: The content type (PARQUET, CSV, JSON, TSV, UNESCAPED_TSV, PSV)
+ content_encoding: The content encoding (currently only IDENTITY is supported)
+ column_names: Optional column names to assign
+ include_columns: Optional columns to include in the result
+ read_func_kwargs_provider: Optional kwargs provider for customization
+ ray_options_provider: Optional Ray options provider
+ ray_init_options: Optional Ray initialization options
+ **kwargs: Additional kwargs, including optional 'io_config' for filesystem configuration
+
+ Returns:
+ DataFrame: The Daft DataFrame
+
+ Raises:
+ AssertionError: If content_type is not supported or content_encoding is not IDENTITY
+
+ Examples:
+ # Read local parquet files (filesystem auto-inferred)
+ df = files_to_dataframe(
+ uris=["file1.parquet", "file2.parquet"],
+ content_type=ContentType.PARQUET.value,
+ content_encoding=ContentEncoding.IDENTITY.value
+ )
+
+ # Read CSV files
+ df = files_to_dataframe(
+ uris=["file1.csv", "file2.csv"],
+ content_type=ContentType.CSV.value,
+ content_encoding=ContentEncoding.IDENTITY.value
+ )

+ # Read S3 files with custom IOConfig
+ from daft.io import IOConfig, S3Config
+ s3_config = IOConfig(s3=S3Config(...))
+ df = files_to_dataframe(
+ uris=["s3://bucket/file1.parquet", "s3://bucket/file2.parquet"],
+ content_type=ContentType.PARQUET.value,
+ content_encoding=ContentEncoding.IDENTITY.value,
+ io_config=s3_config
+ )
+ """
  if ray_init_options is None:
  ray_init_options = {}

- assert (
- content_type == ContentType.PARQUET.value
- ), f"daft native reader currently only supports parquet, got {content_type}"
+ if content_type not in CONTENT_TYPE_TO_READ_FN.keys():
+ raise NotImplementedError(
+ f"Daft native reader supports {CONTENT_TYPE_TO_READ_FN.keys()}, got {content_type}."
+ f"Try using the Ray Dataset reader instead."
+ )

- assert (
- content_encoding == ContentEncoding.IDENTITY.value
- ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+ # Handle content encoding - for now, we only support identity and gzip
+ if content_encoding not in [
+ ContentEncoding.IDENTITY.value,
+ ContentEncoding.GZIP.value,
+ ]:
+ raise NotImplementedError(
+ f"Daft native reader currently supports identity and gzip encoding, got {content_encoding}"
+ )

  if not ray.is_initialized():
- ray.init(ignore_reinit_error=True, **ray_init_options)
+ ray.init(**ray_init_options)

  daft.context.set_runner_ray(noop_if_initialized=True)

- if s3_client_kwargs is None:
- s3_client_kwargs = {}
-
- kwargs = {}
+ read_kwargs = {}
  if read_func_kwargs_provider is not None:
- kwargs = read_func_kwargs_provider(content_type, kwargs)
+ read_kwargs = read_func_kwargs_provider(content_type, read_kwargs)

- # TODO(raghumdani): pass in coerce_int96_timestamp arg
- # https://github.com/Eventual-Inc/Daft/issues/1894
+ # Add content-type-specific reader kwargs
+ content_type_kwargs = content_type_to_reader_kwargs(content_type)
+ read_kwargs.update(content_type_kwargs)

- io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+ # Extract io_config from kwargs if provided, otherwise use None
+ io_config = kwargs.pop("io_config", None)

- logger.debug(
- f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
- )
+ # Merge any remaining kwargs into read_kwargs (including file_path_column for native Daft support)
+ read_kwargs.update(kwargs)

- df, latency = timed_invocation(daft.read_parquet, path=uris, io_config=io_config)
+ logger.debug(f"Preparing to read {len(uris)} files into daft dataframe")
+ logger.debug(f"Content type: {content_type}")
+ logger.debug(f"Final read_kwargs: {read_kwargs}")

- logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
+ # Get the appropriate Daft reader function based on content type
+ daft_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+ if not daft_read_func:
+ raise NotImplementedError(
+ f"Daft reader for content type '{content_type}' not implemented. "
+ f"Known content types: {list(CONTENT_TYPE_TO_READ_FN.keys())}"
+ )

- columns_to_read = include_columns or column_names
+ # Handle schema for all supported formats
+ table_version_schema = kwargs.get("table_version_schema")
+ if table_version_schema is not None:
+ # Convert PyArrow schema to Daft schema using the official API
+ daft_schema = daft.Schema.from_pyarrow_schema(table_version_schema)
+ # Convert DaftSchema to dictionary format required by Daft readers
+ schema_dict = {field.name: field.dtype for field in daft_schema}
+ # Remove table_version_schema from kwargs since Daft readers don't recognize it
+ read_kwargs.pop("table_version_schema", None)
+ # Use explicit schema with infer_schema=False for correctness and performance
+ read_kwargs.update({"infer_schema": False, "schema": schema_dict})
+ else:
+ # Remove table_version_schema parameter if present but None
+ read_kwargs.pop("table_version_schema", None)
+
+ logger.debug(f"Reading {len(uris)} files with Daft using {daft_read_func}.")

- logger.debug(f"Taking columns {columns_to_read} from the daft df.")
+ # Call the appropriate Daft reader function
+ if io_config is not None and content_type == ContentType.PARQUET.value:
+ # Only parquet reader supports io_config parameter
+ df, latency = timed_invocation(
+ daft_read_func, path=uris, io_config=io_config, **read_kwargs
+ )
+ else:
+ df, latency = timed_invocation(daft_read_func, path=uris, **read_kwargs)
+
+ logger.debug(f"Daft read {len(uris)} files in {latency}s.")
+
+ # Apply column selection after reading
+ columns_to_read = include_columns or column_names
+ file_path_column = read_kwargs.get("file_path_column")
+ if file_path_column and columns_to_read and file_path_column not in columns_to_read:
+ # Add file_path_column to selection if it was specified
+ columns_to_read.append(file_path_column)

  if columns_to_read:
+ logger.debug(f"Selecting columns {columns_to_read} with Daft.")
  return df.select(*columns_to_read)
  else:
  return df


- def daft_s3_file_to_table(
- s3_url: str,
+ def daft_file_to_pyarrow_table(
+ path: str,
  content_type: str,
  content_encoding: str,
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
  column_names: Optional[List[str]] = None,
  include_columns: Optional[List[str]] = None,
  pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
  partial_file_download_params: Optional[PartialFileDownloadParams] = None,
- **s3_client_kwargs,
- ):
+ **kwargs,
+ ) -> pa.Table:
  assert (
  content_type == ContentType.PARQUET.value
  ), f"daft native reader currently only supports parquet, got {content_type}"
@@ -119,13 +903,16 @@ def daft_s3_file_to_table(
  ):
  row_groups = partial_file_download_params.row_groups_to_download

- io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+ # Extract io_config from kwargs if provided
+ io_config = kwargs.pop("io_config", None)
+ if not io_config and path.startswith("s3://"):
+ io_config = _get_s3_io_config(kwargs)

- logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
+ logger.debug(f"Preparing to read object from {path} into daft table")

  pa_table, latency = timed_invocation(
  read_parquet_into_pyarrow,
- path=s3_url,
+ path=path,
  columns=include_columns or column_names,
  row_groups=row_groups,
  io_config=io_config,
@@ -134,7 +921,7 @@ def daft_s3_file_to_table(
  file_timeout_ms=file_timeout_ms,
  )

- logger.debug(f"Time to read S3 object from {s3_url} into daft table: {latency}s")
+ logger.debug(f"Time to read object from {path} into daft table: {latency}s")

  if kwargs.get("schema") is not None:
  input_schema = kwargs["schema"]
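The hunks above replace the S3-only s3_files_to_dataframe entry point with the filesystem-agnostic files_to_dataframe and swap the s3_client_kwargs argument for an optional io_config kwarg. A minimal caller-side migration sketch based on the signatures and docstrings shown in this diff (the bucket names and region value are illustrative):

    from daft.io import IOConfig, S3Config
    from deltacat.types.media import ContentType, ContentEncoding
    from deltacat.utils.daft import files_to_dataframe

    # 2.0.0b10: s3_files_to_dataframe(uris=..., s3_client_kwargs={...})
    # 2.0.0b12: pass a Daft IOConfig instead of boto-style client kwargs
    df = files_to_dataframe(
        uris=["s3://bucket/file1.parquet", "s3://bucket/file2.parquet"],
        content_type=ContentType.PARQUET.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        io_config=IOConfig(s3=S3Config(region_name="us-east-1")),
    )
    pa_table = df.to_arrow()  # materialize to a PyArrow table if needed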