deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
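The largest single addition in this release is the new pandas I/O test module, deltacat/tests/utils/test_pandas.py (+1106), shown in full in the hunk below. For orientation, here is a minimal sketch of the encoding-aware reader entry point those tests exercise; the call shape is inferred from the tests themselves rather than from documented API, and the path is hypothetical.

# Illustrative only: mirrors the read_csv calls in the test hunk below.
import fsspec
from deltacat.types.media import ContentEncoding
from deltacat.utils.pandas import read_csv

fs = fsspec.filesystem("file")
df = read_csv(
    "/tmp/example.csv.gz",  # hypothetical path
    filesystem=fs,  # fsspec or PyArrow filesystem, or None to auto-infer
    content_encoding=ContentEncoding.GZIP.value,
    names=["col1", "col2", "col3"],  # remaining kwargs pass through to pandas.read_csv
)

The filesystem argument accepts an fsspec filesystem, a PyArrow filesystem, or None to let the reader infer one, which is the matrix the TestPandasFileSystemSupport class in the hunk below covers.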
deltacat/tests/utils/test_pandas.py (new file)
@@ -0,0 +1,1106 @@
+from unittest import TestCase
+import csv
+import pandas as pd
+import tempfile
+import fsspec
+import gzip
+import json
+import polars as pl
+from deltacat.types.media import ContentType, ContentEncoding
+from deltacat.utils.pandas import (
+    dataframe_to_file,
+    file_to_dataframe,
+    content_type_to_reader_kwargs,
+    _add_column_kwargs,
+    ReadKwargsProviderPandasCsvPureUtf8,
+    concat_dataframes,
+)
+
+
+class TestPandasWriters(TestCase):
+    def setUp(self):
+        # Create a test DataFrame with data that includes delimiters
+        self.df = pd.DataFrame({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
+        self.fs = fsspec.filesystem("file")
+        self.base_path = tempfile.mkdtemp()
+        self.fs.makedirs(self.base_path, exist_ok=True)
+
+    def tearDown(self):
+        self.fs.rm(self.base_path, recursive=True)
+
+    def test_write_feather(self):
+        path = f"{self.base_path}/test.feather"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.FEATHER.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_feather(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_csv(self):
+        path = f"{self.base_path}/test.csv"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.CSV.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to commas in data
+                assert '"a,b\tc|d",1' in content
+                assert '"e,f\tg|h",2' in content
+
+    def test_write_tsv(self):
+        path = f"{self.base_path}/test.tsv"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.TSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to tabs in data
+                assert '"a,b\tc|d"\t1' in content
+                assert '"e,f\tg|h"\t2' in content
+
+    def test_write_psv(self):
+        path = f"{self.base_path}/test.psv"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.PSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to pipes in data
+                assert '"a,b\tc|d"|1' in content
+                assert '"e,f\tg|h"|2' in content
+
+    def test_write_unescaped_tsv(self):
+        # Create DataFrame without delimiters for unescaped TSV
+        df = pd.DataFrame({"col1": ["abc", "def"], "col2": [1, 2]})
+        path = f"{self.base_path}/test.tsv"
+
+        dataframe_to_file(
+            df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.UNESCAPED_TSV.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # With quoting_style="none", strings should not be quoted
+                assert "abc\t1" in content
+                assert "def\t2" in content
+
+    def test_write_orc(self):
+        path = f"{self.base_path}/test.orc"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.ORC.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_orc(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_parquet(self):
+        path = f"{self.base_path}/test.parquet"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.PARQUET.value,
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content
+        result = pd.read_parquet(path)
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_write_json(self):
+        path = f"{self.base_path}/test.json"
+
+        dataframe_to_file(
+            self.df,
+            path,
+            self.fs,
+            lambda x: path,
+            content_type=ContentType.JSON.value,
+            orient="records",  # Write each record as a separate JSON object
+            lines=True,  # This should create NDJSON format
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content (should be GZIP compressed NDJSON format)
+        with self.fs.open(path, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8").strip()
+                # Content should be NDJSON format: each line is a separate JSON object
+                lines = content.split("\n")
+                assert len(lines) == 2  # 2 records
+
+                # Parse each line as a separate JSON object
+                data = [json.loads(line) for line in lines]
+                assert data[0] == {"col1": "a,b\tc|d", "col2": 1}
+                assert data[1] == {"col1": "e,f\tg|h", "col2": 2}
+
+    def test_write_avro(self):
+        path = f"{self.base_path}/test.avro"
+
+        dataframe_to_file(
+            self.df, path, self.fs, lambda x: path, content_type=ContentType.AVRO.value
+        )
+        assert self.fs.exists(path), "file was not written"
+
+        # Verify content by reading with polars
+        result = pl.read_avro(path).to_pandas()
+        pd.testing.assert_frame_equal(result, self.df)
+
+
+class TestPandasReaders(TestCase):
+    def setUp(self):
+        # Create test data files for reading
+        self.fs = fsspec.filesystem("file")
+        self.base_path = tempfile.mkdtemp()
+        self.fs.makedirs(self.base_path, exist_ok=True)
+
+        # Create test DataFrame
+        self.df = pd.DataFrame(
+            {
+                "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
+                "col2": [1, 2, 3],
+                "col3": [1.1, 2.2, 3.3],
+            }
+        )
+
+        # Write test files in different formats
+        self._create_test_files()
+
+    def tearDown(self):
+        self.fs.rm(self.base_path, recursive=True)
+
+    def _create_test_files(self):
+        """Create test files in different formats with different compression types."""
+        import gzip
+
+        # Create CSV file (GZIP compressed)
+        csv_path = f"{self.base_path}/test.csv"
+        with self.fs.open(csv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create TSV file (GZIP compressed)
+        tsv_path = f"{self.base_path}/test.tsv"
+        with self.fs.open(tsv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create PSV file (GZIP compressed)
+        psv_path = f"{self.base_path}/test.psv"
+        with self.fs.open(psv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
+                gz.write(content.encode("utf-8"))
+
+        # Create unescaped TSV file (GZIP compressed)
+        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+        pd.DataFrame(
+            {"col1": ["abc", "def", "ghi"], "col2": [1, 2, 3], "col3": [1.1, 2.2, 3.3]}
+        )
+        with self.fs.open(unescaped_tsv_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
+                gz.write(content.encode("utf-8"))
+
+        # Create Parquet file
+        parquet_path = f"{self.base_path}/test.parquet"
+        self.df.to_parquet(parquet_path, index=False)
+
+        # Create Feather file
+        feather_path = f"{self.base_path}/test.feather"
+        self.df.to_feather(feather_path)
+
+        # Create JSON file (GZIP compressed, NDJSON format)
+        json_path = f"{self.base_path}/test.json"
+        with self.fs.open(json_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                json_str = self.df.to_json(orient="records", lines=True)
+                gz.write(json_str.encode("utf-8"))
+
+        # Create Avro file using polars (since pandas delegates to polars for Avro)
+        avro_path = f"{self.base_path}/test.avro"
+        pl_df = pl.from_pandas(self.df)
+        pl_df.write_avro(avro_path)
+
+        # Create ORC file
+        orc_path = f"{self.base_path}/test.orc"
+        self.df.to_orc(orc_path, index=False)
+
+    def test_content_type_to_reader_kwargs(self):
+        # Test CSV kwargs
+        csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
+        expected_csv = {"sep": ",", "header": None}
+        assert csv_kwargs == expected_csv
+
+        # Test TSV kwargs
+        tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
+        expected_tsv = {"sep": "\t", "header": None}
+        assert tsv_kwargs == expected_tsv
+
+        # Test PSV kwargs
+        psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
+        expected_psv = {"sep": "|", "header": None}
+        assert psv_kwargs == expected_psv
+
+        # Test unescaped TSV kwargs
+        unescaped_kwargs = content_type_to_reader_kwargs(
+            ContentType.UNESCAPED_TSV.value
+        )
+        expected_unescaped = {
+            "sep": "\t",
+            "header": None,
+            "na_values": [""],
+            "keep_default_na": False,
+            "quoting": csv.QUOTE_NONE,
+        }
+        assert unescaped_kwargs == expected_unescaped
+
+        # Test Parquet kwargs (should be empty)
+        parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
+        assert parquet_kwargs == {}
+
+        # Test Avro kwargs (should be empty)
+        avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
+        assert avro_kwargs == {}
+
+    def test_add_column_kwargs(self):
+        kwargs = {}
+        column_names = ["col1", "col2", "col3"]
+        include_columns = ["col1", "col2"]
+
+        # Test CSV column kwargs
+        _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
+        assert kwargs["names"] == column_names
+        assert kwargs["usecols"] == include_columns
+
+        # Test Parquet column kwargs
+        kwargs = {}
+        _add_column_kwargs(
+            ContentType.PARQUET.value, column_names, include_columns, kwargs
+        )
+        assert kwargs["columns"] == include_columns
+        assert "names" not in kwargs
+
+    def test_file_to_dataframe_csv(self):
+        # Test reading CSV with file_to_dataframe
+        csv_path = f"{self.base_path}/test.csv"
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_tsv(self):
+        # Test reading TSV with file_to_dataframe
+        tsv_path = f"{self.base_path}/test.tsv"
+
+        result = file_to_dataframe(
+            tsv_path,
+            ContentType.TSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_psv(self):
+        # Test reading PSV with file_to_dataframe
+        psv_path = f"{self.base_path}/test.psv"
+
+        result = file_to_dataframe(
+            psv_path,
+            ContentType.PSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_unescaped_tsv(self):
+        # Test reading unescaped TSV with file_to_dataframe
+        unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+
+        result = file_to_dataframe(
+            unescaped_tsv_path,
+            ContentType.UNESCAPED_TSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["abc", "def", "ghi"]
+
+    def test_file_to_dataframe_parquet(self):
+        # Test reading Parquet with file_to_dataframe
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_dataframe(
+            parquet_path, ContentType.PARQUET.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_feather(self):
+        # Test reading Feather with file_to_dataframe
+        feather_path = f"{self.base_path}/test.feather"
+
+        result = file_to_dataframe(
+            feather_path, ContentType.FEATHER.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_json(self):
+        # Test reading JSON with file_to_dataframe
+        json_path = f"{self.base_path}/test.json"
+
+        result = file_to_dataframe(
+            json_path,
+            ContentType.JSON.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+        )
+
+        assert len(result) == 3
+        assert set(result.columns) == {"col1", "col2", "col3"}
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_avro(self):
+        # Test reading Avro with file_to_dataframe
+        avro_path = f"{self.base_path}/test.avro"
+
+        result = file_to_dataframe(
+            avro_path, ContentType.AVRO.value, filesystem=self.fs
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        # Avro may have different dtypes, so compare values
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_orc(self):
+        # Test reading ORC with file_to_dataframe
+        orc_path = f"{self.base_path}/test.orc"
+
+        result = file_to_dataframe(orc_path, ContentType.ORC.value, filesystem=self.fs)
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        pd.testing.assert_frame_equal(result, self.df)
+
+    def test_file_to_dataframe_with_column_selection(self):
+        # Test reading with column selection
+        csv_path = f"{self.base_path}/test.csv"
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+            include_columns=["col1", "col2"],
+        )
+
+        assert len(result) == 3
+        assert len(result.columns) == 2  # Should only have 2 columns
+        assert list(result.columns) == ["col1", "col2"]
+
+    def test_file_to_dataframe_with_kwargs_provider(self):
+        # Test reading with kwargs provider
+        csv_path = f"{self.base_path}/test.csv"
+        provider = ReadKwargsProviderPandasCsvPureUtf8(
+            include_columns=["col1", "col2", "col3"]
+        )
+
+        result = file_to_dataframe(
+            csv_path,
+            ContentType.CSV.value,
+            ContentEncoding.GZIP.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+            pd_read_func_kwargs_provider=provider,
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        # With string types provider, all columns should be strings
+        assert all(result[col].dtype == "object" for col in result.columns)
+
+    def test_file_to_dataframe_filesystem_inference(self):
+        # Test filesystem inference when no filesystem is provided
+        # Use JSON file since Parquet requires seekable files
+        json_path = f"{self.base_path}/test.json"
+
+        result = file_to_dataframe(
+            json_path,
+            ContentType.JSON.value,
+            ContentEncoding.GZIP.value
+            # No filesystem provided - should be inferred
+        )
+
+        assert len(result) == 3
+        assert set(result.columns) == {"col1", "col2", "col3"}
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_file_to_dataframe_unsupported_content_type(self):
+        # Test error handling for unsupported content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(NotImplementedError) as context:
+            file_to_dataframe(
+                parquet_path, "unsupported/content-type", filesystem=self.fs
+            )
+
+        assert "not implemented" in str(context.exception)
+
+    def test_file_to_dataframe_bzip2_compression(self):
+        # Test BZIP2 compression handling
+        import bz2
+
+        # Create a BZIP2 compressed CSV file
+        csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+        compressed_content = bz2.compress(csv_content.encode("utf-8"))
+
+        bz2_path = f"{self.base_path}/test.csv.bz2"
+        with self.fs.open(bz2_path, "wb") as f:
+            f.write(compressed_content)
+
+        result = file_to_dataframe(
+            bz2_path,
+            ContentType.CSV.value,
+            ContentEncoding.BZIP2.value,
+            filesystem=self.fs,
+            column_names=["col1", "col2", "col3"],
+        )
+
+        assert len(result) == 3
+        assert list(result.columns) == ["col1", "col2", "col3"]
+        assert result["col1"].tolist() == ["a,b\tc|d", "e,f\tg|h", "test"]
+
+    def test_concat_dataframes(self):
+        # Test concatenation of multiple dataframes
+        df1 = pd.DataFrame({"col1": ["a"], "col2": [1]})
+        df2 = pd.DataFrame({"col1": ["b"], "col2": [2]})
+        df3 = pd.DataFrame({"col1": ["c"], "col2": [3]})
+
+        # Test normal concatenation
+        result = concat_dataframes([df1, df2, df3])
+        assert len(result) == 3
+        assert result["col1"].tolist() == ["a", "b", "c"]
+
+        # Test single dataframe
+        result = concat_dataframes([df1])
+        pd.testing.assert_frame_equal(result, df1)
+
+        # Test empty list
+        result = concat_dataframes([])
+        assert result is None
+
+        # Test None input
+        result = concat_dataframes(None)
+        assert result is None
+
+
+class TestPandasFileSystemSupport(TestCase):
+    """
+    Comprehensive tests for encoding-aware reader functions with different filesystem types.
+    Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
+    """
+
+    def setUp(self):
+        import pyarrow.fs as pafs
+
+        # Create test data
+        self.test_data = pd.DataFrame(
+            {
+                "col1": ["value1", "value2", "value3"],
+                "col2": [1, 2, 3],
+                "col3": [1.1, 2.2, 3.3],
+            }
+        )
+
+        # Set up temporary directory
+        self.temp_dir = tempfile.mkdtemp()
+
+        # Set up different filesystem types
+        self.fsspec_fs = fsspec.filesystem("file")
+        self.pyarrow_fs = pafs.LocalFileSystem()
+
+        # Create test files for each content type
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.temp_dir)
+
+    def _create_test_files(self):
+        """Create test files in different formats with different compression types."""
+        import gzip
+        import bz2
+
+        # CSV files without headers to match test data structure
+        csv_data = "value1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"
+
+        # Create uncompressed CSV
+        with open(f"{self.temp_dir}/test.csv", "w") as f:
+            f.write(csv_data)
+
+        # Create GZIP compressed CSV
+        with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
+            f.write(csv_data)
+
+        # Create BZIP2 compressed CSV
+        with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
+            f.write(csv_data)
+
+        # Parquet file
+        self.test_data.to_parquet(f"{self.temp_dir}/test.parquet", index=False)
+
+        # Feather file
+        self.test_data.to_feather(f"{self.temp_dir}/test.feather")
+
+        # JSON file (GZIP compressed, NDJSON format)
+        json_path = f"{self.temp_dir}/test.json"
+        with self.fsspec_fs.open(json_path, "wb") as f:
+            with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                json_str = self.test_data.to_json(orient="records", lines=True)
+                gz.write(json_str.encode("utf-8"))
+
+        # AVRO file (using polars since pandas delegates to polars for AVRO)
+        import polars as pl
+
+        pl_df = pl.from_pandas(self.test_data)
+        pl_df.write_avro(f"{self.temp_dir}/test.avro")
+
+        # ORC file
+        self.test_data.to_orc(f"{self.temp_dir}/test.orc")
+
+    def _assert_dataframes_equal(self, result, expected):
+        """Helper to assert pandas dataframes are equal."""
+        pd.testing.assert_frame_equal(
+            result.reset_index(drop=True),
+            expected.reset_index(drop=True),
+            check_dtype=False,  # Allow minor type differences
+        )
+
+    def test_csv_with_fsspec_filesystem(self):
+        """Test CSV reading with fsspec AbstractFileSystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test GZIP compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_gzip.csv.gz",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test BZIP2 compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_bzip2.csv.bz2",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.BZIP2.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_csv_with_pyarrow_filesystem(self):
+        """Test CSV reading with PyArrow FileSystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test GZIP compressed CSV
+        result = read_csv(
+            f"{self.temp_dir}/test_gzip.csv.gz",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_csv_with_auto_inferred_filesystem(self):
+        """Test CSV reading with automatically inferred filesystem."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test uncompressed CSV (filesystem=None, should auto-infer)
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_parquet_with_different_filesystems(self):
+        """Test Parquet reading with different filesystem types."""
+        from deltacat.utils.pandas import read_parquet
+
+        # Test with fsspec
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_parquet(
+            f"{self.temp_dir}/test.parquet",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_feather_with_different_filesystems(self):
+        """Test Feather reading with different filesystem types."""
+        from deltacat.utils.pandas import read_feather
+
+        # Test with fsspec
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_feather(
+            f"{self.temp_dir}/test.feather",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_json_with_different_filesystems(self):
+        """Test JSON reading with different filesystem types."""
+        from deltacat.utils.pandas import read_json
+
+        # Test with fsspec
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_json(
+            f"{self.temp_dir}/test.json",
+            filesystem=None,
+            content_encoding=ContentEncoding.GZIP.value,
+            lines=True,  # Required for NDJSON format
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_avro_with_different_filesystems(self):
+        """Test AVRO reading with different filesystem types."""
+        from deltacat.utils.pandas import read_avro
+
+        # Test with fsspec
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_avro(
+            f"{self.temp_dir}/test.avro",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_orc_with_different_filesystems(self):
+        """Test ORC reading with different filesystem types."""
+        from deltacat.utils.pandas import read_orc
+
+        # Test with fsspec
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with PyArrow
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=self.pyarrow_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+        # Test with auto-inferred
+        result = read_orc(
+            f"{self.temp_dir}/test.orc",
+            filesystem=None,
+            content_encoding=ContentEncoding.IDENTITY.value,
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_file_to_dataframe_with_different_filesystems(self):
+        """Test file_to_dataframe with different filesystem types for all content types."""
+        test_cases = [
+            (
+                f"{self.temp_dir}/test.csv",
+                ContentType.CSV.value,
+                ContentEncoding.IDENTITY.value,
+                {"column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test_gzip.csv.gz",
+                ContentType.CSV.value,
+                ContentEncoding.GZIP.value,
+                {"column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test.parquet",
+                ContentType.PARQUET.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.feather",
+                ContentType.FEATHER.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.json",
+                ContentType.JSON.value,
+                ContentEncoding.GZIP.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.avro",
+                ContentType.AVRO.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+            (
+                f"{self.temp_dir}/test.orc",
+                ContentType.ORC.value,
+                ContentEncoding.IDENTITY.value,
+                {},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_type, content_encoding, extra_kwargs in test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(
+                    content_type=content_type,
+                    filesystem=fs_name,
+                    encoding=content_encoding,
+                ):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=content_type,
+                        content_encoding=content_encoding,
+                        filesystem=filesystem,
+                        **extra_kwargs,
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_compression_encoding_with_different_filesystems(self):
+        """Test that compression encoding works correctly with different filesystem types."""
+        test_cases = [
+            (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
+            (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
+            (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_encoding in test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(encoding=content_encoding, filesystem=fs_name):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=ContentType.CSV.value,
+                        content_encoding=content_encoding,
+                        filesystem=filesystem,
+                        column_names=["col1", "col2", "col3"],
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_filesystem_open_kwargs(self):
+        """Test that filesystem open kwargs are properly passed through."""
+        from deltacat.utils.pandas import read_csv
+
+        # Test with custom fs_open_kwargs
+        result = read_csv(
+            f"{self.temp_dir}/test.csv",
+            filesystem=self.fsspec_fs,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            fs_open_kwargs={
+                "encoding": "utf-8"
+            },  # This should be passed to filesystem.open()
+            names=["col1", "col2", "col3"],
+        )
+        self._assert_dataframes_equal(result, self.test_data)
+
+    def test_delimited_formats_with_different_filesystems(self):
+        """Test delimited formats (TSV, PSV, etc.) with different filesystem types."""
+        # Create TSV test file without headers to match test data structure
+        tsv_data = "value1\t1\t1.1\nvalue2\t2\t2.2\nvalue3\t3\t3.3\n"
+        with open(f"{self.temp_dir}/test.tsv", "w") as f:
+            f.write(tsv_data)
+
+        # Create PSV test file without headers to match test data structure
+        psv_data = "value1|1|1.1\nvalue2|2|2.2\nvalue3|3|3.3\n"
+        with open(f"{self.temp_dir}/test.psv", "w") as f:
+            f.write(psv_data)
+
+        delimited_test_cases = [
+            (
+                f"{self.temp_dir}/test.tsv",
+                ContentType.TSV.value,
+                {"sep": "\t", "column_names": ["col1", "col2", "col3"]},
+            ),
+            (
+                f"{self.temp_dir}/test.psv",
+                ContentType.PSV.value,
+                {"sep": "|", "column_names": ["col1", "col2", "col3"]},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+            ("auto-inferred", None),
+        ]
+
+        for path, content_type, extra_kwargs in delimited_test_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(content_type=content_type, filesystem=fs_name):
+                    result = file_to_dataframe(
+                        path=path,
+                        content_type=content_type,
+                        content_encoding=ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                        **extra_kwargs,
+                    )
+                    self._assert_dataframes_equal(result, self.test_data)
+
+    def test_end_to_end_round_trip_all_formats(self):
+        """Test end-to-end round trip with write and read for all supported formats."""
+        from deltacat.utils.pandas import (
+            write_csv,
+            write_parquet,
+            write_feather,
+            write_json,
+            write_avro,
+            write_orc,
+            read_csv,
+            read_parquet,
+            read_feather,
+            read_json,
+            read_avro,
+            read_orc,
+        )
+
+        # Test cases with writer/reader pairs
+        # Note: CSV and JSON writers automatically apply GZIP compression
+        round_trip_cases = [
+            (
+                "test_roundtrip.csv",
+                write_csv,
+                read_csv,
+                {
+                    "content_encoding": ContentEncoding.GZIP.value,
+                    "names": ["col1", "col2", "col3"],
+                },
+                {"index": False},
+            ),
+            (
+                "test_roundtrip.parquet",
+                write_parquet,
+                read_parquet,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.feather",
+                write_feather,
+                read_feather,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.json",
+                write_json,
+                read_json,
+                {"content_encoding": ContentEncoding.GZIP.value, "orient": "records"},
+                {"orient": "records"},
+            ),
+            (
+                "test_roundtrip.avro",
+                write_avro,
+                read_avro,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+            (
+                "test_roundtrip.orc",
+                write_orc,
+                read_orc,
+                {"content_encoding": ContentEncoding.IDENTITY.value},
+                {},
+            ),
+        ]
+
+        filesystems = [
+            ("fsspec", self.fsspec_fs),
+            ("pyarrow", self.pyarrow_fs),
+        ]
+
+        for (
+            filename,
+            write_func,
+            read_func,
+            read_kwargs,
+            write_kwargs,
+        ) in round_trip_cases:
+            for fs_name, filesystem in filesystems:
+                with self.subTest(format=filename, filesystem=fs_name):
+                    file_path = f"{self.temp_dir}/{filename}"
+
+                    # Write the file
+                    write_func(
+                        self.test_data, file_path, filesystem=filesystem, **write_kwargs
+                    )
+
+                    # Read it back
+                    result = read_func(file_path, filesystem=filesystem, **read_kwargs)
+
+                    # Verify it matches
+                    self._assert_dataframes_equal(result, self.test_data)
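Taken together, the tests above double as usage documentation for the writer/reader pair at the center of this module. Below is a minimal round-trip sketch; the signatures are inferred from the calls in the tests above (not from package documentation), and the temp path and lambda path provider are illustrative.

# Illustrative only: call shapes mirror the dataframe_to_file / file_to_dataframe tests above.
import tempfile
import fsspec
import pandas as pd
from deltacat.types.media import ContentType
from deltacat.utils.pandas import dataframe_to_file, file_to_dataframe

df = pd.DataFrame({"col1": ["a", "b"], "col2": [1, 2]})
fs = fsspec.filesystem("file")
path = f"{tempfile.mkdtemp()}/example.parquet"

# Write: (dataframe, path, filesystem, block path provider), plus a content type.
dataframe_to_file(df, path, fs, lambda x: path, content_type=ContentType.PARQUET.value)

# Read it back; Parquet needs no content_encoding, while delimited formats take one (see tests above).
result = file_to_dataframe(path, ContentType.PARQUET.value, filesystem=fs)
pd.testing.assert_frame_equal(result, df)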