deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_daft.py
@@ -1,6 +1,9 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
+from deltacat.utils.daft import (
+    daft_file_to_pyarrow_table,
+    files_to_dataframe,
+)
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
 import pyarrow as pa
@@ -8,11 +11,11 @@ import pyarrow as pa
 from pyarrow import parquet as pq


-class TestDaftS3FileToTable(unittest.TestCase):
+class TestDaftFileToPyarrowTable(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"

-    def test_read_from_s3_all_columns(self):
-        table = daft_s3_file_to_table(
+    def test_read_from_local_all_columns(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -20,8 +23,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["a", "b"])
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_via_include_columns(self):
-        table = daft_s3_file_to_table(
+    def test_read_from_local_single_column_via_include_columns(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -30,8 +33,8 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["b"])
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_via_column_names(self):
-        table = daft_s3_file_to_table(
+    def test_read_from_local_single_column_via_column_names(self):
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -40,12 +43,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.names, ["b"])
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema(self):
+    def test_read_from_local_single_column_with_schema(self):
         schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -56,12 +59,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema_reverse_order(self):
+    def test_read_from_local_single_column_with_schema_reverse_order(self):
         schema = pa.schema([("b", pa.string()), ("a", pa.int8())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -71,12 +74,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema_subset_cols(self):
+    def test_read_from_local_single_column_with_schema_subset_cols(self):
         schema = pa.schema([("a", pa.int8())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -86,12 +89,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("a").type, pa.int8())
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema_extra_cols(self):
+    def test_read_from_local_single_column_with_schema_extra_cols(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -104,12 +107,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
+    def test_read_from_local_single_column_with_schema_extra_cols_column_names(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -123,12 +126,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 100)

-    def test_read_from_s3_single_column_with_schema_only_missing_col(self):
+    def test_read_from_local_single_column_with_schema_only_missing_col(self):
         schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
         pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
             schema=schema
         )
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -142,12 +145,12 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.schema.field("MISSING").type, pa.string())
         self.assertEqual(table.num_rows, 0)

-    def test_read_from_s3_single_column_with_row_groups(self):
+    def test_read_from_local_single_column_with_row_groups(self):

         metadata = pq.read_metadata(self.MVP_PATH)
         ppp = PartialParquetParameters.of(pq_metadata=metadata)
         ppp["row_groups_to_download"] = ppp.row_groups_to_download[1:2]
-        table = daft_s3_file_to_table(
+        table = daft_file_to_pyarrow_table(
             self.MVP_PATH,
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -158,11 +161,11 @@ class TestDaftS3FileToTable(unittest.TestCase):
         self.assertEqual(table.num_rows, 10)


-class TestDaftS3FilesToDataFrame(unittest.TestCase):
+class TestFilesToDataFrame(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"

-    def test_read_from_s3_all_columns(self):
-        df = s3_files_to_dataframe(
+    def test_read_local_files_all_columns(self):
+        df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
@@ -173,30 +176,117 @@ class TestDaftS3FilesToDataFrame(unittest.TestCase):
         self.assertEqual(table.schema.names, ["a", "b"])
         self.assertEqual(table.num_rows, 100)

-    def test_does_not_read_from_s3_if_not_materialized(self):
-        df = s3_files_to_dataframe(
+    def test_read_local_files_with_column_selection(self):
+        df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
             content_type=ContentType.PARQUET.value,
+            include_columns=["b"],
             ray_init_options={"local_mode": True, "ignore_reinit_error": True},
         )

+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_local_files_does_not_materialize_by_default(self):
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+
+        # Should raise RuntimeError because df is not materialized yet
         self.assertRaises(RuntimeError, lambda: len(df))
+
+        # After collecting, it should work
         df.collect()
         self.assertEqual(len(df), 100)

-    def test_raises_error_if_not_supported_content_type(self):
+    def test_supports_unescaped_tsv_content_type(self):
+        # Test that UNESCAPED_TSV is now supported (was previously unsupported)
+        # Use a CSV file since we're testing TSV reader functionality
+        csv_path = "deltacat/tests/utils/data/non_empty_valid.csv"
+        df = files_to_dataframe(
+            uris=[csv_path],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.UNESCAPED_TSV.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+        # Should succeed without raising an exception - this tests that UNESCAPED_TSV is supported
+        table = df.to_arrow()
+        # Just verify we got some data back, don't assert specific schema since we're reading CSV as TSV
+        self.assertGreater(table.num_rows, 0)
+        self.assertGreater(len(table.schema.names), 0)

+    def test_supports_gzip_content_encoding(self):
+        # Test that GZIP encoding is now supported (was previously unsupported)
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.GZIP.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+        )
+        # Should succeed without raising an exception
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_raises_error_if_not_supported_content_type(self):
+        # Test that truly unsupported content types raise NotImplementedError
         self.assertRaises(
-            AssertionError,
-            lambda: s3_files_to_dataframe(
+            NotImplementedError,
+            lambda: files_to_dataframe(
                 uris=[self.MVP_PATH],
                 content_encoding=ContentEncoding.IDENTITY.value,
-                content_type=ContentType.UNESCAPED_TSV.value,
+                content_type=ContentType.AVRO.value,  # AVRO is actually unsupported
                 ray_init_options={"local_mode": True, "ignore_reinit_error": True},
             ),
         )

+    def test_raises_error_if_not_supported_content_encoding(self):
+        # Test that truly unsupported content encodings raise NotImplementedError
+        self.assertRaises(
+            NotImplementedError,
+            lambda: files_to_dataframe(
+                uris=[self.MVP_PATH],
+                content_encoding=ContentEncoding.ZSTD.value,  # ZSTD is actually unsupported
+                content_type=ContentType.PARQUET.value,
+                ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            ),
+        )
+
+    def test_accepts_custom_kwargs(self):
+        # Test that custom kwargs are passed through to daft.read_parquet
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            # Custom kwarg that should be passed to daft.read_parquet
+            coerce_int96_timestamp_unit="ns",
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_accepts_io_config(self):
+        # Test that io_config parameter is accepted and passed correctly
+        df = files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True, "ignore_reinit_error": True},
+            # io_config=None should work fine for local files
+            io_config=None,
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+

 if __name__ == "__main__":
     unittest.main()
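
For reference, the test changes above track the rename of DeltaCAT's Daft helpers from S3-specific to storage-agnostic entry points: daft_s3_file_to_table becomes daft_file_to_pyarrow_table, and s3_files_to_dataframe becomes files_to_dataframe. Below is a minimal usage sketch assuming only the call patterns exercised in the tests above; the file path is the test fixture, and any arguments not shown here are not covered by this diff.

    # Sketch based on the calls in the test diff above; not an exhaustive API reference.
    from deltacat.types.media import ContentEncoding, ContentType
    from deltacat.utils.daft import daft_file_to_pyarrow_table, files_to_dataframe

    # Read a single Parquet file (local path or URI) into a PyArrow table.
    table = daft_file_to_pyarrow_table(
        "deltacat/tests/utils/data/mvp.parquet",
        content_encoding=ContentEncoding.IDENTITY.value,
        content_type=ContentType.PARQUET.value,
    )
    print(table.num_rows)

    # Read one or more files lazily into a Daft DataFrame; len(df) only works
    # after the frame is materialized with collect().
    df = files_to_dataframe(
        uris=["deltacat/tests/utils/data/mvp.parquet"],
        content_encoding=ContentEncoding.IDENTITY.value,
        content_type=ContentType.PARQUET.value,
    )
    df.collect()
    print(len(df))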