deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1040 @@
1
+ from unittest import TestCase
2
+ import polars as pl
3
+ import pandas as pd
4
+ import tempfile
5
+ import fsspec
6
+ import gzip
7
+ import json
8
+ import io
9
+ from deltacat.types.media import ContentType, ContentEncoding
10
+ from deltacat.utils.polars import (
11
+ dataframe_to_file,
12
+ file_to_dataframe,
13
+ content_type_to_reader_kwargs,
14
+ _add_column_kwargs,
15
+ ReadKwargsProviderPolarsStringTypes,
16
+ concat_dataframes,
17
+ )
18
+
19
+
20
class TestPolarsWriters(TestCase):
    """Write-path checks for dataframe_to_file across supported content types.

    Each test writes one format to a temp directory, confirms the file
    exists, and then validates the on-disk bytes with an independent reader.
    """

    def setUp(self):
        # Values deliberately embed comma/tab/pipe so delimiter escaping
        # is exercised by the CSV/TSV/PSV writers.
        self.df = pl.DataFrame({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
        self.fs = fsspec.filesystem("file")
        self.base_path = tempfile.mkdtemp()
        self.fs.makedirs(self.base_path, exist_ok=True)

    def tearDown(self):
        self.fs.rm(self.base_path, recursive=True)

    def _write(self, df, path, content_type):
        # Shared write-and-assert step so each test only states its
        # format-specific expectations.
        dataframe_to_file(
            df,
            path,
            self.fs,
            lambda x: path,
            content_type=content_type,
        )
        assert self.fs.exists(path), "file was not written"

    def _read_gzipped_text(self, path):
        # Delimited and JSON outputs are GZIP-compressed; decompress and
        # decode to a utf-8 string for content assertions.
        with self.fs.open(path, "rb") as raw:
            with gzip.GzipFile(fileobj=raw) as gz:
                return gz.read().decode("utf-8")

    def test_write_feather(self):
        target = f"{self.base_path}/test.feather"
        self._write(self.df, target, ContentType.FEATHER.value)
        # Feather is polars IPC; round-trip must be exact.
        assert pl.read_ipc(target).equals(self.df)

    def test_write_csv(self):
        target = f"{self.base_path}/test.csv.gz"
        self._write(self.df, target, ContentType.CSV.value)
        text = self._read_gzipped_text(target)
        # Fields containing the separator must come back quoted.
        assert '"a,b\tc|d",1' in text
        assert '"e,f\tg|h",2' in text

    def test_write_tsv(self):
        target = f"{self.base_path}/test.tsv.gz"
        self._write(self.df, target, ContentType.TSV.value)
        text = self._read_gzipped_text(target)
        # Polars writes TSV with tab separators.
        assert '"a,b\tc|d"\t1' in text
        assert '"e,f\tg|h"\t2' in text

    def test_write_psv(self):
        target = f"{self.base_path}/test.psv.gz"
        self._write(self.df, target, ContentType.PSV.value)
        text = self._read_gzipped_text(target)
        # Polars writes PSV with pipe separators.
        assert '"a,b\tc|d"|1' in text
        assert '"e,f\tg|h"|2' in text

    def test_write_unescaped_tsv(self):
        # Delimiter-free frame: unescaped TSV is written with quote_char=None,
        # so no quoting should appear in the output.
        plain = pl.DataFrame({"col1": ["abc", "def"], "col2": [1, 2]})
        target = f"{self.base_path}/test.tsv.gz"
        self._write(plain, target, ContentType.UNESCAPED_TSV.value)
        text = self._read_gzipped_text(target)
        assert "abc\t1" in text
        assert "def\t2" in text

    def test_write_orc(self):
        target = f"{self.base_path}/test.orc"
        self._write(self.df, target, ContentType.ORC.value)
        # Polars delegates ORC to PyArrow, so validate via pandas.
        pd.testing.assert_frame_equal(pd.read_orc(target), self.df.to_pandas())

    def test_write_parquet(self):
        target = f"{self.base_path}/test.parquet"
        self._write(self.df, target, ContentType.PARQUET.value)
        assert pl.read_parquet(target).equals(self.df)

    def test_write_json(self):
        target = f"{self.base_path}/test.json.gz"
        self._write(self.df, target, ContentType.JSON.value)
        # Output is GZIP-compressed, newline-delimited JSON: one object
        # per non-empty line.
        rows = [ln for ln in self._read_gzipped_text(target).split("\n") if ln]
        assert len(rows) == 2  # 2 records
        assert json.loads(rows[0]) == {"col1": "a,b\tc|d", "col2": 1}
        assert json.loads(rows[1]) == {"col1": "e,f\tg|h", "col2": 2}

    def test_write_avro(self):
        target = f"{self.base_path}/test.avro"
        self._write(self.df, target, ContentType.AVRO.value)
        assert pl.read_avro(target).equals(self.df)
185
+
186
+
187
+ class TestPolarsReaders(TestCase):
188
+ def setUp(self):
189
+ # Create test data files for reading
190
+ self.fs = fsspec.filesystem("file")
191
+ self.base_path = tempfile.mkdtemp()
192
+ self.fs.makedirs(self.base_path, exist_ok=True)
193
+
194
+ # Create test DataFrame
195
+ self.df = pl.DataFrame(
196
+ {
197
+ "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
198
+ "col2": [1, 2, 3],
199
+ "col3": [1.1, 2.2, 3.3],
200
+ }
201
+ )
202
+
203
+ # Write test files in different formats
204
+ self._create_test_files()
205
+
206
+ def tearDown(self):
207
+ self.fs.rm(self.base_path, recursive=True)
208
+
209
+ def _create_test_files(self):
210
+ """Create test files for reading tests with the original test data structure."""
211
+ import gzip
212
+ import bz2
213
+
214
+ # Create CSV file (GZIP compressed) with the original test data
215
+ csv_path = f"{self.base_path}/test.csv"
216
+ with self.fs.open(csv_path, "wb") as f:
217
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
218
+ content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
219
+ gz.write(content.encode("utf-8"))
220
+
221
+ # Create TSV file (GZIP compressed)
222
+ tsv_path = f"{self.base_path}/test.tsv"
223
+ with self.fs.open(tsv_path, "wb") as f:
224
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
225
+ content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
226
+ gz.write(content.encode("utf-8"))
227
+
228
+ # Create PSV file (GZIP compressed)
229
+ psv_path = f"{self.base_path}/test.psv"
230
+ with self.fs.open(psv_path, "wb") as f:
231
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
232
+ content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
233
+ gz.write(content.encode("utf-8"))
234
+
235
+ # Create unescaped TSV file (GZIP compressed)
236
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
237
+ with self.fs.open(unescaped_tsv_path, "wb") as f:
238
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
239
+ content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
240
+ gz.write(content.encode("utf-8"))
241
+
242
+ # Create Parquet file
243
+ parquet_path = f"{self.base_path}/test.parquet"
244
+ self.df.write_parquet(parquet_path)
245
+
246
+ # Create Feather file
247
+ feather_path = f"{self.base_path}/test.feather"
248
+ self.df.write_ipc(feather_path)
249
+
250
+ # Create JSON file (GZIP compressed, NDJSON format)
251
+ json_path = f"{self.base_path}/test.json"
252
+ with self.fs.open(json_path, "wb") as f:
253
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
254
+ # Use proper NDJSON format - one JSON object per line
255
+ lines = []
256
+ for i in range(len(self.df)):
257
+ row = self.df.row(i)
258
+ json_obj = {"col1": row[0], "col2": row[1], "col3": row[2]}
259
+ lines.append(json.dumps(json_obj))
260
+ content = "\n".join(lines) + "\n"
261
+ gz.write(content.encode("utf-8"))
262
+
263
+ # Create Avro file
264
+ avro_path = f"{self.base_path}/test.avro"
265
+ self.df.write_avro(avro_path)
266
+
267
+ # Create ORC file using pandas (since polars delegates to pandas for ORC)
268
+ orc_path = f"{self.base_path}/test.orc"
269
+ self.df.to_pandas().to_orc(orc_path)
270
+
271
+ # Create BZIP2 compressed CSV for compression tests
272
+ bzip2_path = f"{self.base_path}/test_bzip2.csv.bz2"
273
+ with bz2.open(bzip2_path, "wt") as f:
274
+ f.write('"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n')
275
+
276
+ def test_content_type_to_reader_kwargs(self):
277
+ # Test CSV kwargs
278
+ csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
279
+ expected_csv = {"separator": ",", "has_header": False}
280
+ assert csv_kwargs == expected_csv
281
+
282
+ # Test TSV kwargs
283
+ tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
284
+ expected_tsv = {"separator": "\t", "has_header": False}
285
+ assert tsv_kwargs == expected_tsv
286
+
287
+ # Test PSV kwargs
288
+ psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
289
+ expected_psv = {"separator": "|", "has_header": False}
290
+ assert psv_kwargs == expected_psv
291
+
292
+ # Test unescaped TSV kwargs
293
+ unescaped_kwargs = content_type_to_reader_kwargs(
294
+ ContentType.UNESCAPED_TSV.value
295
+ )
296
+ expected_unescaped = {
297
+ "separator": "\t",
298
+ "has_header": False,
299
+ "null_values": [""],
300
+ "quote_char": None,
301
+ }
302
+ assert unescaped_kwargs == expected_unescaped
303
+
304
+ # Test Parquet kwargs (should be empty)
305
+ parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
306
+ assert parquet_kwargs == {}
307
+
308
+ def test_add_column_kwargs(self):
309
+ kwargs = {}
310
+ column_names = ["col1", "col2", "col3"]
311
+ include_columns = ["col1", "col2"]
312
+
313
+ # Test CSV column kwargs
314
+ _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
315
+ assert kwargs["new_columns"] == column_names
316
+ assert kwargs["columns"] == include_columns
317
+
318
+ # Test Parquet column kwargs
319
+ kwargs = {}
320
+ _add_column_kwargs(
321
+ ContentType.PARQUET.value, column_names, include_columns, kwargs
322
+ )
323
+ assert kwargs["columns"] == include_columns
324
+ assert "new_columns" not in kwargs
325
+
326
+ def test_read_csv_from_file(self):
327
+ csv_path = f"{self.base_path}/test.csv"
328
+
329
+ # Read using polars directly to test our reader logic
330
+ with self.fs.open(csv_path, "rb") as f:
331
+ with gzip.GzipFile(fileobj=f) as gz:
332
+ source = io.BytesIO(gz.read())
333
+
334
+ kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
335
+ kwargs["new_columns"] = ["col1", "col2", "col3"]
336
+
337
+ result = pl.read_csv(source, **kwargs)
338
+
339
+ # Verify basic structure
340
+ assert len(result) == 3
341
+ assert list(result.columns) == ["col1", "col2", "col3"]
342
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
343
+
344
+ def test_read_tsv_from_file(self):
345
+ tsv_path = f"{self.base_path}/test.tsv"
346
+
347
+ with self.fs.open(tsv_path, "rb") as f:
348
+ with gzip.GzipFile(fileobj=f) as gz:
349
+ source = io.BytesIO(gz.read())
350
+
351
+ kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
352
+ kwargs["new_columns"] = ["col1", "col2", "col3"]
353
+
354
+ result = pl.read_csv(source, **kwargs)
355
+
356
+ assert len(result) == 3
357
+ assert list(result.columns) == ["col1", "col2", "col3"]
358
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
359
+
360
+ def test_read_psv_from_file(self):
361
+ psv_path = f"{self.base_path}/test.psv"
362
+
363
+ with self.fs.open(psv_path, "rb") as f:
364
+ with gzip.GzipFile(fileobj=f) as gz:
365
+ source = io.BytesIO(gz.read())
366
+
367
+ kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
368
+ kwargs["new_columns"] = ["col1", "col2", "col3"]
369
+
370
+ result = pl.read_csv(source, **kwargs)
371
+
372
+ assert len(result) == 3
373
+ assert list(result.columns) == ["col1", "col2", "col3"]
374
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
375
+
376
+ def test_read_parquet_from_file(self):
377
+ parquet_path = f"{self.base_path}/test.parquet"
378
+ result = pl.read_parquet(parquet_path)
379
+
380
+ assert len(result) == 3
381
+ assert list(result.columns) == ["col1", "col2", "col3"]
382
+ assert result.equals(self.df)
383
+
384
+ def test_read_feather_from_file(self):
385
+ feather_path = f"{self.base_path}/test.feather"
386
+ result = pl.read_ipc(feather_path)
387
+
388
+ assert len(result) == 3
389
+ assert list(result.columns) == ["col1", "col2", "col3"]
390
+ assert result.equals(self.df)
391
+
392
+ def test_read_json_from_file(self):
393
+ json_path = f"{self.base_path}/test.json"
394
+
395
+ with self.fs.open(json_path, "rb") as f:
396
+ with gzip.GzipFile(fileobj=f) as gz:
397
+ source = io.BytesIO(gz.read())
398
+
399
+ result = pl.read_ndjson(source)
400
+
401
+ assert len(result) == 3
402
+ assert set(result.columns) == {"col1", "col2", "col3"}
403
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
404
+
405
+ def test_read_avro_from_file(self):
406
+ avro_path = f"{self.base_path}/test.avro"
407
+ result = pl.read_avro(avro_path)
408
+
409
+ assert len(result) == 3
410
+ assert list(result.columns) == ["col1", "col2", "col3"]
411
+ assert result.equals(self.df)
412
+
413
+ def test_read_orc_from_file(self):
414
+ # Test ORC reading via pandas conversion
415
+ orc_path = f"{self.base_path}/test.orc"
416
+
417
+ # Read with pandas and convert to polars (mimicking our ORC handling)
418
+ import pandas as pd
419
+
420
+ pd_df = pd.read_orc(orc_path)
421
+ result = pl.from_pandas(pd_df)
422
+
423
+ assert len(result) == 3
424
+ assert list(result.columns) == ["col1", "col2", "col3"]
425
+ # Convert both to pandas for comparison due to potential type differences
426
+ pd.testing.assert_frame_equal(result.to_pandas(), self.df.to_pandas())
427
+
428
+ def test_read_kwargs_provider_string_types(self):
429
+ # Test the string types provider
430
+ provider = ReadKwargsProviderPolarsStringTypes()
431
+ kwargs = {"separator": ",", "has_header": False}
432
+
433
+ # Apply string types
434
+ result_kwargs = provider._get_kwargs(ContentType.CSV.value, kwargs)
435
+
436
+ # Should add infer_schema=False for string type inference
437
+ assert "infer_schema" in result_kwargs
438
+ assert result_kwargs["infer_schema"] is False
439
+
440
+ def test_concat_dataframes(self):
441
+ # Test concatenation of multiple dataframes
442
+ df1 = pl.DataFrame({"col1": ["a"], "col2": [1]})
443
+ df2 = pl.DataFrame({"col1": ["b"], "col2": [2]})
444
+ df3 = pl.DataFrame({"col1": ["c"], "col2": [3]})
445
+
446
+ # Test normal concatenation
447
+ result = concat_dataframes([df1, df2, df3])
448
+ assert len(result) == 3
449
+ assert result["col1"].to_list() == ["a", "b", "c"]
450
+
451
+ # Test single dataframe
452
+ result = concat_dataframes([df1])
453
+ assert result.equals(df1)
454
+
455
+ # Test empty list
456
+ result = concat_dataframes([])
457
+ assert result is None
458
+
459
+ # Test None input
460
+ result = concat_dataframes(None)
461
+ assert result is None
462
+
463
+ def test_file_to_dataframe_csv(self):
464
+ # Test reading CSV with file_to_dataframe
465
+ csv_path = f"{self.base_path}/test.csv"
466
+
467
+ result = file_to_dataframe(
468
+ csv_path,
469
+ ContentType.CSV.value,
470
+ ContentEncoding.GZIP.value,
471
+ filesystem=self.fs,
472
+ column_names=["col1", "col2", "col3"],
473
+ )
474
+
475
+ assert len(result) == 3
476
+ assert list(result.columns) == ["col1", "col2", "col3"]
477
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
478
+
479
+ def test_file_to_dataframe_tsv(self):
480
+ # Test reading TSV with file_to_dataframe
481
+ tsv_path = f"{self.base_path}/test.tsv"
482
+
483
+ result = file_to_dataframe(
484
+ tsv_path,
485
+ ContentType.TSV.value,
486
+ ContentEncoding.GZIP.value,
487
+ filesystem=self.fs,
488
+ column_names=["col1", "col2", "col3"],
489
+ )
490
+
491
+ assert len(result) == 3
492
+ assert list(result.columns) == ["col1", "col2", "col3"]
493
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
494
+
495
+ def test_file_to_dataframe_parquet(self):
496
+ # Test reading Parquet with file_to_dataframe
497
+ parquet_path = f"{self.base_path}/test.parquet"
498
+
499
+ result = file_to_dataframe(
500
+ parquet_path, ContentType.PARQUET.value, filesystem=self.fs
501
+ )
502
+
503
+ assert len(result) == 3
504
+ assert list(result.columns) == ["col1", "col2", "col3"]
505
+ assert result.equals(self.df)
506
+
507
+ def test_file_to_dataframe_feather(self):
508
+ # Test reading Feather with file_to_dataframe
509
+ feather_path = f"{self.base_path}/test.feather"
510
+
511
+ result = file_to_dataframe(
512
+ feather_path, ContentType.FEATHER.value, filesystem=self.fs
513
+ )
514
+
515
+ assert len(result) == 3
516
+ assert list(result.columns) == ["col1", "col2", "col3"]
517
+ assert result.equals(self.df)
518
+
519
+ def test_file_to_dataframe_json(self):
520
+ # Test reading JSON with file_to_dataframe
521
+ json_path = f"{self.base_path}/test.json"
522
+
523
+ result = file_to_dataframe(
524
+ json_path,
525
+ ContentType.JSON.value,
526
+ ContentEncoding.GZIP.value,
527
+ filesystem=self.fs,
528
+ )
529
+
530
+ assert len(result) == 3
531
+ assert set(result.columns) == {"col1", "col2", "col3"}
532
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
533
+
534
+ def test_file_to_dataframe_avro(self):
535
+ # Test reading Avro with file_to_dataframe
536
+ avro_path = f"{self.base_path}/test.avro"
537
+
538
+ result = file_to_dataframe(
539
+ avro_path, ContentType.AVRO.value, filesystem=self.fs
540
+ )
541
+
542
+ assert len(result) == 3
543
+ assert list(result.columns) == ["col1", "col2", "col3"]
544
+ assert result.equals(self.df)
545
+
546
+ def test_file_to_dataframe_orc(self):
547
+ # Test reading ORC with file_to_dataframe
548
+ orc_path = f"{self.base_path}/test.orc"
549
+
550
+ result = file_to_dataframe(orc_path, ContentType.ORC.value, filesystem=self.fs)
551
+
552
+ assert len(result) == 3
553
+ assert list(result.columns) == ["col1", "col2", "col3"]
554
+ # Convert both to pandas for comparison due to potential type differences
555
+ pd.testing.assert_frame_equal(result.to_pandas(), self.df.to_pandas())
556
+
557
+ def test_file_to_dataframe_with_column_selection(self):
558
+ # Test reading with column selection
559
+ csv_path = f"{self.base_path}/test.csv"
560
+
561
+ # When has_header=False and we specify columns, we need to use column indices or
562
+ # not provide new_columns. Let's test by just specifying the first 2 columns by index
563
+ result = file_to_dataframe(
564
+ csv_path,
565
+ ContentType.CSV.value,
566
+ ContentEncoding.GZIP.value,
567
+ filesystem=self.fs,
568
+ include_columns=[0, 1], # Select first two columns by index
569
+ )
570
+
571
+ assert len(result) == 3
572
+ assert len(result.columns) == 2 # Should only have 2 columns
573
+ # With auto-generated column names when has_header=False
574
+ assert list(result.columns) == ["column_1", "column_2"]
575
+
576
+ def test_file_to_dataframe_with_kwargs_provider(self):
577
+ # Test reading with kwargs provider
578
+ csv_path = f"{self.base_path}/test.csv"
579
+ provider = ReadKwargsProviderPolarsStringTypes(
580
+ include_columns=["column_1", "column_2", "column_3"]
581
+ )
582
+
583
+ result = file_to_dataframe(
584
+ csv_path,
585
+ ContentType.CSV.value,
586
+ ContentEncoding.GZIP.value,
587
+ filesystem=self.fs,
588
+ pl_read_func_kwargs_provider=provider,
589
+ )
590
+
591
+ assert len(result) == 3
592
+ assert list(result.columns) == ["column_1", "column_2", "column_3"]
593
+ # With string types provider, all columns should be strings
594
+ assert all(result[col].dtype == pl.Utf8 for col in result.columns)
595
+
596
+ def test_file_to_dataframe_filesystem_inference(self):
597
+ # Test filesystem inference when no filesystem is provided
598
+ parquet_path = f"{self.base_path}/test.parquet"
599
+
600
+ result = file_to_dataframe(
601
+ parquet_path,
602
+ ContentType.PARQUET.value
603
+ # No filesystem provided - should be inferred
604
+ )
605
+
606
+ assert len(result) == 3
607
+ assert list(result.columns) == ["col1", "col2", "col3"]
608
+ assert result.equals(self.df)
609
+
610
+ def test_file_to_dataframe_unsupported_content_type(self):
611
+ # Test error handling for unsupported content type
612
+ parquet_path = f"{self.base_path}/test.parquet"
613
+
614
+ with self.assertRaises(NotImplementedError) as context:
615
+ file_to_dataframe(
616
+ parquet_path, "unsupported/content-type", filesystem=self.fs
617
+ )
618
+
619
+ assert "not implemented" in str(context.exception)
620
+
621
+ def test_file_to_dataframe_bzip2_compression(self):
622
+ # Test BZIP2 compression handling
623
+ import bz2
624
+
625
+ # Create a BZIP2 compressed CSV file
626
+ csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
627
+ compressed_content = bz2.compress(csv_content.encode("utf-8"))
628
+
629
+ bz2_path = f"{self.base_path}/test.csv.bz2"
630
+ with self.fs.open(bz2_path, "wb") as f:
631
+ f.write(compressed_content)
632
+
633
+ result = file_to_dataframe(
634
+ bz2_path,
635
+ ContentType.CSV.value,
636
+ ContentEncoding.BZIP2.value,
637
+ filesystem=self.fs,
638
+ column_names=["col1", "col2", "col3"],
639
+ )
640
+
641
+ assert len(result) == 3
642
+ assert list(result.columns) == ["col1", "col2", "col3"]
643
+ assert result["col1"].to_list() == ["a,b\tc|d", "e,f\tg|h", "test"]
644
+
645
+
646
+ class TestPolarsFileSystemSupport(TestCase):
647
+ """
648
+ Comprehensive tests for encoding-aware reader functions with different filesystem types.
649
+ Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
650
+ """
651
+
652
+ def setUp(self):
653
+ import pyarrow.fs as pafs
654
+
655
+ # Create test data
656
+ self.test_data = pl.DataFrame(
657
+ {
658
+ "col1": ["value1", "value2", "value3"],
659
+ "col2": [1, 2, 3],
660
+ "col3": [1.1, 2.2, 3.3],
661
+ }
662
+ )
663
+
664
+ # Set up temporary directory
665
+ self.temp_dir = tempfile.mkdtemp()
666
+
667
+ # Set up different filesystem types
668
+ self.fsspec_fs = fsspec.filesystem("file")
669
+ self.pyarrow_fs = pafs.LocalFileSystem()
670
+
671
+ # Create test files for each content type
672
+ self._create_test_files()
673
+
674
+ def tearDown(self):
675
+ import shutil
676
+
677
+ shutil.rmtree(self.temp_dir)
678
+
679
+ def _create_test_files(self):
680
+ """Create test files in different formats with different compression types."""
681
+ import gzip
682
+ import bz2
683
+
684
+ # CSV files
685
+ csv_data = "col1,col2,col3\nvalue1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"
686
+
687
+ # Create uncompressed CSV
688
+ with open(f"{self.temp_dir}/test.csv", "w") as f:
689
+ f.write(csv_data)
690
+
691
+ # Create GZIP compressed CSV (fix: properly close the file)
692
+ with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
693
+ f.write(csv_data)
694
+
695
+ # Create BZIP2 compressed CSV (fix: properly close the file)
696
+ with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
697
+ f.write(csv_data)
698
+
699
+ # Parquet file
700
+ self.test_data.write_parquet(f"{self.temp_dir}/test.parquet")
701
+
702
+ # Feather/IPC file
703
+ self.test_data.write_ipc(f"{self.temp_dir}/test.feather")
704
+
705
+ # JSON file (NDJSON)
706
+ json_data = '{"col1":"value1","col2":1,"col3":1.1}\n{"col1":"value2","col2":2,"col3":2.2}\n{"col1":"value3","col2":3,"col3":3.3}\n'
707
+ with open(f"{self.temp_dir}/test.json", "w") as f:
708
+ f.write(json_data)
709
+
710
+ # AVRO file
711
+ self.test_data.write_avro(f"{self.temp_dir}/test.avro")
712
+
713
+ # ORC file (via pandas since polars delegates to pandas for ORC)
714
+ self.test_data.to_pandas().to_orc(f"{self.temp_dir}/test.orc")
715
+
716
+ def _assert_dataframes_equal(self, result, expected):
717
+ """Helper to assert polars dataframes are equal."""
718
+ # Convert to pandas for comparison since polars equality can be tricky with floating point
719
+ pd.testing.assert_frame_equal(
720
+ result.to_pandas().reset_index(drop=True),
721
+ expected.to_pandas().reset_index(drop=True),
722
+ check_dtype=False, # Allow minor type differences
723
+ )
724
+
725
+ def test_csv_with_fsspec_filesystem(self):
726
+ """Test CSV reading with fsspec AbstractFileSystem."""
727
+ from deltacat.utils.polars import read_csv
728
+
729
+ # Test uncompressed CSV
730
+ result = read_csv(
731
+ f"{self.temp_dir}/test.csv",
732
+ filesystem=self.fsspec_fs,
733
+ content_encoding=ContentEncoding.IDENTITY.value,
734
+ has_header=True,
735
+ )
736
+ self._assert_dataframes_equal(result, self.test_data)
737
+
738
+ # Test GZIP compressed CSV
739
+ result = read_csv(
740
+ f"{self.temp_dir}/test_gzip.csv.gz",
741
+ filesystem=self.fsspec_fs,
742
+ content_encoding=ContentEncoding.GZIP.value,
743
+ has_header=True,
744
+ )
745
+ self._assert_dataframes_equal(result, self.test_data)
746
+
747
+ # Test BZIP2 compressed CSV
748
+ result = read_csv(
749
+ f"{self.temp_dir}/test_bzip2.csv.bz2",
750
+ filesystem=self.fsspec_fs,
751
+ content_encoding=ContentEncoding.BZIP2.value,
752
+ has_header=True,
753
+ )
754
+ self._assert_dataframes_equal(result, self.test_data)
755
+
756
+ def test_csv_with_pyarrow_filesystem(self):
757
+ """Test CSV reading with PyArrow FileSystem."""
758
+ from deltacat.utils.polars import read_csv
759
+
760
+ # Test uncompressed CSV
761
+ result = read_csv(
762
+ f"{self.temp_dir}/test.csv",
763
+ filesystem=self.pyarrow_fs,
764
+ content_encoding=ContentEncoding.IDENTITY.value,
765
+ has_header=True,
766
+ )
767
+ self._assert_dataframes_equal(result, self.test_data)
768
+
769
+ # Test GZIP compressed CSV
770
+ result = read_csv(
771
+ f"{self.temp_dir}/test_gzip.csv.gz",
772
+ filesystem=self.pyarrow_fs,
773
+ content_encoding=ContentEncoding.GZIP.value,
774
+ has_header=True,
775
+ )
776
+ self._assert_dataframes_equal(result, self.test_data)
777
+
778
+ def test_csv_with_auto_inferred_filesystem(self):
779
+ """Test CSV reading with automatically inferred filesystem."""
780
+ from deltacat.utils.polars import read_csv
781
+
782
+ # Test uncompressed CSV (filesystem=None, should auto-infer)
783
+ result = read_csv(
784
+ f"{self.temp_dir}/test.csv",
785
+ filesystem=None,
786
+ content_encoding=ContentEncoding.IDENTITY.value,
787
+ has_header=True,
788
+ )
789
+ self._assert_dataframes_equal(result, self.test_data)
790
+
791
+ def test_parquet_with_different_filesystems(self):
792
+ """Test Parquet reading with different filesystem types."""
793
+ from deltacat.utils.polars import read_parquet
794
+
795
+ # Test with fsspec
796
+ result = read_parquet(
797
+ f"{self.temp_dir}/test.parquet",
798
+ filesystem=self.fsspec_fs,
799
+ content_encoding=ContentEncoding.IDENTITY.value,
800
+ )
801
+ self._assert_dataframes_equal(result, self.test_data)
802
+
803
+ # Test with PyArrow
804
+ result = read_parquet(
805
+ f"{self.temp_dir}/test.parquet",
806
+ filesystem=self.pyarrow_fs,
807
+ content_encoding=ContentEncoding.IDENTITY.value,
808
+ )
809
+ self._assert_dataframes_equal(result, self.test_data)
810
+
811
+ # Test with auto-inferred
812
+ result = read_parquet(
813
+ f"{self.temp_dir}/test.parquet",
814
+ filesystem=None,
815
+ content_encoding=ContentEncoding.IDENTITY.value,
816
+ )
817
+ self._assert_dataframes_equal(result, self.test_data)
818
+
819
+ def test_feather_with_different_filesystems(self):
820
+ """Test Feather/IPC reading with different filesystem types."""
821
+ from deltacat.utils.polars import read_ipc
822
+
823
+ # Test with fsspec
824
+ result = read_ipc(
825
+ f"{self.temp_dir}/test.feather",
826
+ filesystem=self.fsspec_fs,
827
+ content_encoding=ContentEncoding.IDENTITY.value,
828
+ )
829
+ self._assert_dataframes_equal(result, self.test_data)
830
+
831
+ # Test with PyArrow
832
+ result = read_ipc(
833
+ f"{self.temp_dir}/test.feather",
834
+ filesystem=self.pyarrow_fs,
835
+ content_encoding=ContentEncoding.IDENTITY.value,
836
+ )
837
+ self._assert_dataframes_equal(result, self.test_data)
838
+
839
+ # Test with auto-inferred
840
+ result = read_ipc(
841
+ f"{self.temp_dir}/test.feather",
842
+ filesystem=None,
843
+ content_encoding=ContentEncoding.IDENTITY.value,
844
+ )
845
+ self._assert_dataframes_equal(result, self.test_data)
846
+
847
+ def test_json_with_different_filesystems(self):
848
+ """Test JSON reading with different filesystem types."""
849
+ from deltacat.utils.polars import read_ndjson
850
+
851
+ # Test with fsspec
852
+ result = read_ndjson(
853
+ f"{self.temp_dir}/test.json",
854
+ filesystem=self.fsspec_fs,
855
+ content_encoding=ContentEncoding.IDENTITY.value,
856
+ )
857
+ self._assert_dataframes_equal(result, self.test_data)
858
+
859
+ # Test with PyArrow
860
+ result = read_ndjson(
861
+ f"{self.temp_dir}/test.json",
862
+ filesystem=self.pyarrow_fs,
863
+ content_encoding=ContentEncoding.IDENTITY.value,
864
+ )
865
+ self._assert_dataframes_equal(result, self.test_data)
866
+
867
+ # Test with auto-inferred
868
+ result = read_ndjson(
869
+ f"{self.temp_dir}/test.json",
870
+ filesystem=None,
871
+ content_encoding=ContentEncoding.IDENTITY.value,
872
+ )
873
+ self._assert_dataframes_equal(result, self.test_data)
874
+
875
+ def test_avro_with_different_filesystems(self):
876
+ """Test AVRO reading with different filesystem types."""
877
+ from deltacat.utils.polars import read_avro
878
+
879
+ # Test with fsspec
880
+ result = read_avro(
881
+ f"{self.temp_dir}/test.avro",
882
+ filesystem=self.fsspec_fs,
883
+ content_encoding=ContentEncoding.IDENTITY.value,
884
+ )
885
+ self._assert_dataframes_equal(result, self.test_data)
886
+
887
+ # Test with PyArrow
888
+ result = read_avro(
889
+ f"{self.temp_dir}/test.avro",
890
+ filesystem=self.pyarrow_fs,
891
+ content_encoding=ContentEncoding.IDENTITY.value,
892
+ )
893
+ self._assert_dataframes_equal(result, self.test_data)
894
+
895
+ # Test with auto-inferred
896
+ result = read_avro(
897
+ f"{self.temp_dir}/test.avro",
898
+ filesystem=None,
899
+ content_encoding=ContentEncoding.IDENTITY.value,
900
+ )
901
+ self._assert_dataframes_equal(result, self.test_data)
902
+
903
+ def test_orc_with_different_filesystems(self):
904
+ """Test ORC reading with different filesystem types."""
905
+ from deltacat.utils.polars import read_orc
906
+
907
+ # Test with fsspec
908
+ result = read_orc(
909
+ f"{self.temp_dir}/test.orc",
910
+ filesystem=self.fsspec_fs,
911
+ content_encoding=ContentEncoding.IDENTITY.value,
912
+ )
913
+ self._assert_dataframes_equal(result, self.test_data)
914
+
915
+ # Test with PyArrow
916
+ result = read_orc(
917
+ f"{self.temp_dir}/test.orc",
918
+ filesystem=self.pyarrow_fs,
919
+ content_encoding=ContentEncoding.IDENTITY.value,
920
+ )
921
+ self._assert_dataframes_equal(result, self.test_data)
922
+
923
+ # Test with auto-inferred
924
+ result = read_orc(
925
+ f"{self.temp_dir}/test.orc",
926
+ filesystem=None,
927
+ content_encoding=ContentEncoding.IDENTITY.value,
928
+ )
929
+ self._assert_dataframes_equal(result, self.test_data)
930
+
931
+ def test_file_to_dataframe_with_different_filesystems(self):
932
+ """Test file_to_dataframe with different filesystem types for all content types."""
933
+ test_cases = [
934
+ (
935
+ f"{self.temp_dir}/test.csv",
936
+ ContentType.CSV.value,
937
+ ContentEncoding.IDENTITY.value,
938
+ {"has_header": True},
939
+ ),
940
+ (
941
+ f"{self.temp_dir}/test_gzip.csv.gz",
942
+ ContentType.CSV.value,
943
+ ContentEncoding.GZIP.value,
944
+ {"has_header": True},
945
+ ),
946
+ (
947
+ f"{self.temp_dir}/test.parquet",
948
+ ContentType.PARQUET.value,
949
+ ContentEncoding.IDENTITY.value,
950
+ {},
951
+ ),
952
+ (
953
+ f"{self.temp_dir}/test.feather",
954
+ ContentType.FEATHER.value,
955
+ ContentEncoding.IDENTITY.value,
956
+ {},
957
+ ),
958
+ (
959
+ f"{self.temp_dir}/test.json",
960
+ ContentType.JSON.value,
961
+ ContentEncoding.IDENTITY.value,
962
+ {},
963
+ ),
964
+ (
965
+ f"{self.temp_dir}/test.avro",
966
+ ContentType.AVRO.value,
967
+ ContentEncoding.IDENTITY.value,
968
+ {},
969
+ ),
970
+ (
971
+ f"{self.temp_dir}/test.orc",
972
+ ContentType.ORC.value,
973
+ ContentEncoding.IDENTITY.value,
974
+ {},
975
+ ),
976
+ ]
977
+
978
+ filesystems = [
979
+ ("fsspec", self.fsspec_fs),
980
+ ("pyarrow", self.pyarrow_fs),
981
+ ("auto-inferred", None),
982
+ ]
983
+
984
+ for path, content_type, content_encoding, extra_kwargs in test_cases:
985
+ for fs_name, filesystem in filesystems:
986
+ with self.subTest(
987
+ content_type=content_type,
988
+ filesystem=fs_name,
989
+ encoding=content_encoding,
990
+ ):
991
+ result = file_to_dataframe(
992
+ path=path,
993
+ content_type=content_type,
994
+ content_encoding=content_encoding,
995
+ filesystem=filesystem,
996
+ **extra_kwargs,
997
+ )
998
+ self._assert_dataframes_equal(result, self.test_data)
999
+
1000
+ def test_compression_encoding_with_different_filesystems(self):
1001
+ """Test that compression encoding works correctly with different filesystem types."""
1002
+ test_cases = [
1003
+ (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
1004
+ (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
1005
+ (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
1006
+ ]
1007
+
1008
+ filesystems = [
1009
+ ("fsspec", self.fsspec_fs),
1010
+ ("pyarrow", self.pyarrow_fs),
1011
+ ("auto-inferred", None),
1012
+ ]
1013
+
1014
+ for path, content_encoding in test_cases:
1015
+ for fs_name, filesystem in filesystems:
1016
+ with self.subTest(encoding=content_encoding, filesystem=fs_name):
1017
+ result = file_to_dataframe(
1018
+ path=path,
1019
+ content_type=ContentType.CSV.value,
1020
+ content_encoding=content_encoding,
1021
+ filesystem=filesystem,
1022
+ has_header=True,
1023
+ )
1024
+ self._assert_dataframes_equal(result, self.test_data)
1025
+
1026
+ def test_filesystem_open_kwargs(self):
1027
+ """Test that filesystem open kwargs are properly passed through."""
1028
+ from deltacat.utils.polars import read_csv
1029
+
1030
+ # Test with custom fs_open_kwargs
1031
+ result = read_csv(
1032
+ f"{self.temp_dir}/test.csv",
1033
+ filesystem=self.fsspec_fs,
1034
+ content_encoding=ContentEncoding.IDENTITY.value,
1035
+ fs_open_kwargs={
1036
+ "encoding": "utf-8"
1037
+ }, # This should be passed to filesystem.open()
1038
+ has_header=True,
1039
+ )
1040
+ self._assert_dataframes_equal(result, self.test_data)