deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/polars.py CHANGED
@@ -1,58 +1,156 @@
 import logging
-from typing import Optional, List, Dict, Callable, Union
+import bz2
+import gzip
+from functools import partial
+from typing import Optional, List, Dict, Callable, Union, Iterable, Any

 import polars as pl
+import pyarrow as pa
+import pyarrow.fs as pafs

 from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider

 from deltacat import logs
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
+from deltacat.utils.performance import timed_invocation

-from deltacat.types.media import ContentType
+from deltacat.types.media import (
+    ContentType,
+    ContentEncoding,
+    DELIMITED_TEXT_CONTENT_TYPES,
+    TABULAR_CONTENT_TYPES,
+)
+from deltacat.types.partial_download import PartialFileDownloadParams

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

+# Encoding to file initialization function mapping
+ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
+    ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
+}
+

 def write_json(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if not filesystem:
-        table.write_ndjson(path, **write_kwargs)
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses
+                table.write_ndjson(f, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_ndjson(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
-            table.write_ndjson(f, **write_kwargs)
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    table.write_ndjson(out, **write_kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_ndjson(f, **write_kwargs)
+
+
+def content_type_to_writer_kwargs(content_type: str) -> Dict[str, any]:
+    """
+    Returns writer kwargs for the given content type when writing with polars.
+    """
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "separator": "\t",
+            "include_header": False,
+            "null_value": "",
+            "quote_style": "never",  # Equivalent to QUOTE_NONE in pandas
+        }
+    if content_type == ContentType.TSV.value:
+        return {
+            "separator": "\t",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type == ContentType.CSV.value:
+        return {
+            "separator": ",",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type == ContentType.PSV.value:
+        return {
+            "separator": "|",
+            "include_header": False,
+            "quote_style": "necessary",
+        }
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.JSON.value,
+        ContentType.AVRO.value,
+        ContentType.ORC.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")


 def write_csv(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
-    **write_kwargs,
+    **kwargs,
 ) -> None:
-    if not filesystem:
-        table.write_csv(path, **write_kwargs)
+    """
+    Write a polars DataFrame to a CSV file (or other delimited text format).
+    """
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses
+                table.write_csv(f, **kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_csv(f, **kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
-            table.write_csv(f, **write_kwargs)
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    table.write_csv(out, **kwargs)
+            else:
+                # No compression indicated, write uncompressed
+                table.write_csv(f, **kwargs)


 def write_avro(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if not filesystem:
-        table.write_avro(path, **write_kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
             table.write_avro(f, **write_kwargs)
@@ -62,25 +160,75 @@ def write_parquet(
     table: pl.DataFrame,
     path: str,
     *,
-    filesystem: Optional[AbstractFileSystem] = None,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     fs_open_kwargs: Dict[str, any] = {},
     **write_kwargs,
 ) -> None:
-    if not filesystem:
-        table.write_parquet(path, **write_kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
     else:
         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
             table.write_parquet(f, **write_kwargs)


+def write_feather(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to a Feather file.
+    """
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            table.write_ipc(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ipc(f, **kwargs)
+
+
+def write_orc(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to an ORC file by delegating to PyArrow implementation.
+    """
+    from deltacat.utils.pyarrow import write_orc as pyarrow_write_orc
+
+    # Convert polars DataFrame to PyArrow Table
+    pa_table = table.to_arrow()
+
+    # Delegate to PyArrow write_orc implementation
+    pyarrow_write_orc(
+        pa_table,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **write_kwargs,
+    )
+
+
 CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
-    # TODO (pdames): add support for other delimited text content types as
-    # pyarrow adds support for custom delimiters, escaping, and None value
-    # representations to pyarrow.csv.WriteOptions.
-    ContentType.AVRO.value: write_avro,
+    ContentType.UNESCAPED_TSV.value: write_csv,
+    ContentType.TSV.value: write_csv,
     ContentType.CSV.value: write_csv,
+    ContentType.PSV.value: write_csv,
     ContentType.PARQUET.value: write_parquet,
+    ContentType.FEATHER.value: write_feather,
     ContentType.JSON.value: write_json,
+    ContentType.AVRO.value: write_avro,
+    ContentType.ORC.value: write_orc,
 }


@@ -108,21 +256,504 @@ def dataframe_size(table: pl.DataFrame) -> int:
 def dataframe_to_file(
     table: pl.DataFrame,
     base_path: str,
-    file_system: Optional[AbstractFileSystem],
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
-    Writes the given Pyarrow Table to a file.
+    Writes the given Polars DataFrame to a file.
     """
     writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
     if not writer:
         raise NotImplementedError(
-            f"Pyarrow writer for content type '{content_type}' not "
+            f"Polars writer for content type '{content_type}' not "
             f"implemented. Known content types: "
-            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys}"
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
         )
     path = block_path_provider(base_path)
-    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
-    writer(table, path, filesystem=file_system, **kwargs)
+    logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
+    writer(table, path, filesystem=filesystem, **writer_kwargs)
+
+
+def write_table(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Write a polars DataFrame to a file in the specified format.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    writer_kwargs = content_type_to_writer_kwargs(content_type)
+    writer_kwargs.update(kwargs)
+    if not writer:
+        raise NotImplementedError(
+            f"Polars writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
+        )
+    writer(
+        table,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **writer_kwargs,
+    )
+
+
+CONTENT_TYPE_TO_PL_READ_FUNC: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: pl.read_csv,
+    ContentType.TSV.value: pl.read_csv,
+    ContentType.CSV.value: pl.read_csv,
+    ContentType.PSV.value: pl.read_csv,
+    ContentType.PARQUET.value: pl.read_parquet,
+    ContentType.FEATHER.value: pl.read_ipc,
+    ContentType.JSON.value: pl.read_ndjson,
+    ContentType.AVRO.value: pl.read_avro,
+}
+
+
+class ReadKwargsProviderPolarsStringTypes(ContentTypeKwargsProvider):
+    """ReadKwargsProvider impl that reads columns of delimited text files
+    as UTF-8 strings (i.e. disables type inference). Useful for ensuring
+    lossless reads of UTF-8 delimited text datasets and improving read
+    performance in cases where type casting is not required."""
+
+    def __init__(self, include_columns: Optional[Iterable[str]] = None):
+        self.include_columns = include_columns
+
+    def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+        if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+            include_columns = (
+                self.include_columns if self.include_columns else kwargs.get("columns")
+            )
+            if not include_columns:
+                # read all columns as strings - disable schema inference
+                kwargs["infer_schema"] = False
+            else:
+                # read only the included columns as strings
+                kwargs["schema_overrides"] = {
+                    column_name: pl.Utf8 for column_name in include_columns
+                }
+        return kwargs
+
+
+def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+    if content_type == ContentType.UNESCAPED_TSV.value:
+        return {
+            "separator": "\t",
+            "has_header": False,
+            "null_values": [""],
+            "quote_char": None,
+        }
+    if content_type == ContentType.TSV.value:
+        return {"separator": "\t", "has_header": False}
+    if content_type == ContentType.CSV.value:
+        return {"separator": ",", "has_header": False}
+    if content_type == ContentType.PSV.value:
+        return {"separator": "|", "has_header": False}
+    if content_type in {
+        ContentType.PARQUET.value,
+        ContentType.FEATHER.value,
+        ContentType.ORC.value,
+        ContentType.JSON.value,
+        ContentType.AVRO.value,
+    }:
+        return {}
+    raise ValueError(f"Unsupported content type: {content_type}")
+
+
+def _add_column_kwargs(
+    content_type: str,
+    column_names: Optional[List[str]],
+    include_columns: Optional[List[str]],
+    kwargs: Dict[str, Any],
+):
+    if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+        if column_names:
+            kwargs["new_columns"] = column_names
+        if include_columns:
+            kwargs["columns"] = include_columns
+    else:
+        if content_type in TABULAR_CONTENT_TYPES:
+            if include_columns:
+                kwargs["columns"] = include_columns
+        else:
+            if include_columns:
+                logger.warning(
+                    f"Ignoring request to include columns {include_columns} "
+                    f"for non-tabular content type {content_type}"
+                )
+
+
+def concat_dataframes(dataframes: List[pl.DataFrame]) -> Optional[pl.DataFrame]:
+    if dataframes is None or not len(dataframes):
+        return None
+    if len(dataframes) == 1:
+        return next(iter(dataframes))
+    return pl.concat(dataframes)
+
+
+def append_column_to_table(
+    table: pl.DataFrame,
+    column_name: str,
+    column_value: Any,
+) -> pl.DataFrame:
+    return table.with_columns(pl.lit(column_value).alias(column_name))
+
+
+def select_columns(
+    table: pl.DataFrame,
+    column_names: List[str],
+) -> pl.DataFrame:
+    return table.select(column_names)
+
+
+def file_to_dataframe(
+    path: str,
+    content_type: str,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pl_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
+) -> pl.DataFrame:
+    """
+    Read a file into a Polars DataFrame using any filesystem.
+
+    Args:
+        path: The file path to read
+        content_type: The content type of the file (e.g., ContentType.CSV.value)
+        content_encoding: The content encoding (default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        pl_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to the reader function
+
+    Returns:
+        pl.DataFrame: The loaded DataFrame
+    """
+    logger.debug(
+        f"Reading {path} to Polars. Content type: {content_type}. "
+        f"Encoding: {content_encoding}"
+    )
+
+    pl_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not pl_read_func:
+        raise NotImplementedError(
+            f"Polars reader for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
+        )
+
+    reader_kwargs = content_type_to_reader_kwargs(content_type)
+    _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+    # Merge with provided kwargs
+    reader_kwargs.update(kwargs)
+
+    if pl_read_func_kwargs_provider:
+        reader_kwargs = pl_read_func_kwargs_provider(content_type, reader_kwargs)
+
+    logger.debug(f"Reading {path} via {pl_read_func} with kwargs: {reader_kwargs}")
+
+    dataframe, latency = timed_invocation(
+        pl_read_func,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        content_encoding=content_encoding,
+        **reader_kwargs,
+    )
+    logger.debug(f"Time to read {path} into Polars DataFrame: {latency}s")
+    return dataframe
+
+
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        if content_encoding == ContentEncoding.IDENTITY.value:
+            with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                return pl.read_csv(f, **read_kwargs)
+        else:
+            # For compressed files with PyArrow, we need to be careful because PyArrow
+            # may auto-decompress some formats. Try to read directly first.
+            try:
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                    # Try reading as if it's already decompressed by PyArrow
+                    return pl.read_csv(f, **read_kwargs)
+            except Exception:
+                # If that fails, try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f) as input_file:
+                        content = input_file.read()
+                        if isinstance(content, str):
+                            content = content.encode("utf-8")
+                        return pl.read_csv(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_csv(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    if isinstance(content, str):
+                        content = content.encode("utf-8")
+                    return pl.read_csv(content, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_parquet(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_parquet(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_parquet(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_parquet(content, **read_kwargs)
+
+
+def read_ipc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ipc(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_ipc(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ipc(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_ipc(content, **read_kwargs)
+
+
+def read_ndjson(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        if content_encoding == ContentEncoding.IDENTITY.value:
+            with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                return pl.read_ndjson(f, **read_kwargs)
+        else:
+            # For compressed files with PyArrow, we need to be careful because PyArrow
+            # may auto-decompress some formats. Try to read directly first.
+            try:
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                    # Try reading as if it's already decompressed by PyArrow
+                    return pl.read_ndjson(f, **read_kwargs)
+            except Exception:
+                # If that fails, try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f) as input_file:
+                        content = input_file.read()
+                        if isinstance(content, str):
+                            content = content.encode("utf-8")
+                        return pl.read_ndjson(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_ndjson(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    if isinstance(content, str):
+                        content = content.encode("utf-8")
+                    return pl.read_ndjson(content, **read_kwargs)
+
+
+def read_avro(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_avro(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_avro(content, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                return pl.read_avro(f, **read_kwargs)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content as bytes and pass to polars
+                    content = input_file.read()
+                    return pl.read_avro(content, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pl.DataFrame:
+    """
+    Read an ORC file using pandas and convert to polars since polars doesn't have native ORC support.
+    """
+    import pandas as pd
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                pd_df = pd.read_orc(f, **read_kwargs)
+                return pl.from_pandas(pd_df)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content and pass to pandas
+                    content = input_file.read()
+                    import io
+
+                    pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                    return pl.from_pandas(pd_df)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            if content_encoding == ContentEncoding.IDENTITY.value:
+                pd_df = pd.read_orc(f, **read_kwargs)
+                return pl.from_pandas(pd_df)
+            else:
+                input_file_init = ENCODING_TO_FILE_INIT.get(
+                    content_encoding, lambda x: x
+                )
+                with input_file_init(f) as input_file:
+                    # Read decompressed content and pass to pandas
+                    content = input_file.read()
+                    import io
+
+                    pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                    return pl.from_pandas(pd_df)
+
+
+# New mapping for encoding-aware reader functions used by file_to_dataframe
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.FEATHER.value: read_ipc,
+    ContentType.JSON.value: read_ndjson,
+    ContentType.AVRO.value: read_avro,
+    ContentType.ORC.value: read_orc,
+}
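
The hunks above add a symmetric, content-type-keyed read/write API to deltacat/utils/polars.py. The sketch below is a minimal, illustrative round trip through the new write_table and file_to_dataframe helpers; it is not taken from the package documentation, and the /tmp path, sample data, and module alias are assumptions made for the example only.

import polars as pl
from deltacat.types.media import ContentType, ContentEncoding
from deltacat.utils import polars as dc_polars  # module extended in this diff

df = pl.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# The ".gz" suffix makes the writer emit gzip-compressed output
# (PyArrow filesystems detect compression from the file extension).
dc_polars.write_table(
    df,
    "/tmp/example.csv.gz",  # hypothetical local path
    content_type=ContentType.CSV.value,
)

# Read it back via the encoding-aware reader registry (CONTENT_TYPE_TO_READ_FN).
# CSVs are written without headers by default, so column names are reassigned here.
roundtrip = dc_polars.file_to_dataframe(
    "/tmp/example.csv.gz",
    content_type=ContentType.CSV.value,
    content_encoding=ContentEncoding.GZIP.value,
    column_names=["id", "name"],
)
print(roundtrip)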