deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. deltacat/__init__.py +41 -16
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/benchmark_engine.py +4 -2
  5. deltacat/benchmarking/conftest.py +1 -1
  6. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  7. deltacat/catalog/__init__.py +62 -5
  8. deltacat/catalog/main/impl.py +26 -10
  9. deltacat/catalog/model/catalog.py +165 -109
  10. deltacat/catalog/model/properties.py +25 -24
  11. deltacat/compute/__init__.py +14 -0
  12. deltacat/compute/converter/constants.py +5 -0
  13. deltacat/compute/converter/converter_session.py +78 -36
  14. deltacat/compute/converter/model/convert_input.py +24 -4
  15. deltacat/compute/converter/model/convert_result.py +61 -0
  16. deltacat/compute/converter/model/converter_session_params.py +52 -10
  17. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  18. deltacat/compute/converter/steps/convert.py +84 -36
  19. deltacat/compute/converter/steps/dedupe.py +25 -4
  20. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  21. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  22. deltacat/compute/converter/utils/io.py +82 -11
  23. deltacat/compute/converter/utils/s3u.py +13 -4
  24. deltacat/compute/jobs/client.py +406 -0
  25. deltacat/constants.py +5 -6
  26. deltacat/env.py +10 -0
  27. deltacat/examples/basic_logging.py +6 -6
  28. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  29. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  30. deltacat/examples/hello_world.py +4 -2
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +198 -0
  33. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  34. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  35. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
  36. deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
  37. deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
  38. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  39. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  40. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
  41. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  42. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  43. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  44. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  45. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  46. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  47. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  48. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  49. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
  50. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  51. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  52. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  53. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  54. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  55. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  56. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  57. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  58. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  59. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
  60. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  61. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  62. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  63. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  64. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  65. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  66. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  67. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  68. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  69. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  70. deltacat/io/__init__.py +13 -0
  71. deltacat/io/dataset/__init__.py +0 -0
  72. deltacat/io/dataset/deltacat_dataset.py +91 -0
  73. deltacat/io/datasink/__init__.py +0 -0
  74. deltacat/io/datasink/deltacat_datasink.py +207 -0
  75. deltacat/io/datasource/__init__.py +0 -0
  76. deltacat/io/datasource/deltacat_datasource.py +580 -0
  77. deltacat/io/reader/__init__.py +0 -0
  78. deltacat/io/reader/deltacat_read_api.py +172 -0
  79. deltacat/storage/__init__.py +2 -0
  80. deltacat/storage/model/expression/__init__.py +47 -0
  81. deltacat/storage/model/expression/expression.py +656 -0
  82. deltacat/storage/model/expression/visitor.py +248 -0
  83. deltacat/storage/model/metafile.py +74 -42
  84. deltacat/storage/model/scan/push_down.py +32 -5
  85. deltacat/storage/model/shard.py +6 -2
  86. deltacat/storage/model/types.py +5 -3
  87. deltacat/tests/_io/reader/__init__.py +0 -0
  88. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  89. deltacat/tests/catalog/data/__init__.py +0 -0
  90. deltacat/tests/catalog/main/__init__.py +0 -0
  91. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  92. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
  93. deltacat/tests/catalog/model/__init__.py +0 -0
  94. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  95. deltacat/tests/catalog/test_catalogs.py +52 -98
  96. deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
  97. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  98. deltacat/tests/daft/__init__.py +0 -0
  99. deltacat/tests/daft/test_model.py +97 -0
  100. deltacat/tests/experimental/__init__.py +0 -0
  101. deltacat/tests/experimental/catalog/__init__.py +0 -0
  102. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  103. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  104. deltacat/tests/experimental/daft/__init__.py +0 -0
  105. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  106. deltacat/tests/experimental/storage/__init__.py +0 -0
  107. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  108. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  109. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  110. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  111. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  112. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  113. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  114. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  115. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  116. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  117. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  118. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  119. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  120. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  121. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  122. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  123. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  124. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  125. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  126. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  127. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  128. deltacat/tests/storage/model/test_expression.py +327 -0
  129. deltacat/tests/storage/model/test_shard.py +3 -1
  130. deltacat/tests/test_deltacat_api.py +50 -9
  131. deltacat/types/media.py +141 -43
  132. deltacat/types/tables.py +35 -7
  133. deltacat/utils/daft.py +531 -5
  134. deltacat/utils/export.py +3 -1
  135. deltacat/utils/filesystem.py +39 -9
  136. deltacat/utils/polars.py +128 -0
  137. deltacat/utils/pyarrow.py +151 -15
  138. deltacat/utils/ray_utils/concurrency.py +1 -1
  139. deltacat/utils/ray_utils/runtime.py +56 -4
  140. deltacat/utils/url.py +1284 -0
  141. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
  142. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
  143. deltacat/catalog/iceberg/__init__.py +0 -4
  144. deltacat/daft/daft_scan.py +0 -111
  145. deltacat/daft/model.py +0 -258
  146. deltacat/examples/common/fixtures.py +0 -15
  147. deltacat/storage/rivulet/__init__.py +0 -11
  148. deltacat/storage/rivulet/feather/__init__.py +0 -5
  149. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  150. /deltacat/{daft → compute/jobs}/__init__.py +0 -0
  151. /deltacat/examples/{common → experimental}/__init__.py +0 -0
  152. /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
  153. /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
  154. /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
  155. /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  156. /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
  157. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  158. /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
  159. /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  160. /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
  161. /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  162. /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
  163. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  164. /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
  165. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  166. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  167. /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
  168. /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
  169. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  170. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
  171. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  172. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  173. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
  174. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
  175. {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/utils/polars.py ADDED
@@ -0,0 +1,128 @@
+ import logging
+ from typing import Optional, List, Dict, Callable, Union
+
+ import polars as pl
+
+ from fsspec import AbstractFileSystem
+ from ray.data.datasource import FilenameProvider
+
+ from deltacat import logs
+
+ from deltacat.types.media import ContentType
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def write_json(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[AbstractFileSystem] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem:
+         table.write_ndjson(path, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_ndjson(f, **write_kwargs)
+
+
+ def write_csv(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[AbstractFileSystem] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem:
+         table.write_csv(path, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_csv(f, **write_kwargs)
+
+
+ def write_avro(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[AbstractFileSystem] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem:
+         table.write_avro(path, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_avro(f, **write_kwargs)
+
+
+ def write_parquet(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[AbstractFileSystem] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem:
+         table.write_parquet(path, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_parquet(f, **write_kwargs)
+
+
+ CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+     # TODO (pdames): add support for other delimited text content types as
+     # pyarrow adds support for custom delimiters, escaping, and None value
+     # representations to pyarrow.csv.WriteOptions.
+     ContentType.AVRO.value: write_avro,
+     ContentType.CSV.value: write_csv,
+     ContentType.PARQUET.value: write_parquet,
+     ContentType.JSON.value: write_json,
+ }
+
+
+ def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+     """
+     Iteratively create 0-copy table slices.
+     """
+     if max_len is None:
+         return [table]
+     tables = []
+     offset = 0
+     records_remaining = len(table)
+     while records_remaining > 0:
+         records_this_entry = min(max_len, records_remaining)
+         tables.append(table.slice(offset, records_this_entry))
+         records_remaining -= records_this_entry
+         offset += records_this_entry
+     return tables
+
+
+ def dataframe_size(table: pl.DataFrame) -> int:
+     return table.estimated_size()
+
+
+ def dataframe_to_file(
+     table: pl.DataFrame,
+     base_path: str,
+     file_system: Optional[AbstractFileSystem],
+     block_path_provider: Union[Callable, FilenameProvider],
+     content_type: str = ContentType.PARQUET.value,
+     **kwargs,
+ ) -> None:
+     """
+     Writes the given Polars DataFrame to a file.
+     """
+     writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+     if not writer:
+         raise NotImplementedError(
+             f"Polars writer for content type '{content_type}' not "
+             f"implemented. Known content types: "
+             f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
+         )
+     path = block_path_provider(base_path)
+     logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+     writer(table, path, filesystem=file_system, **kwargs)
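The new deltacat/utils/polars.py module mirrors the existing pyarrow utilities for Polars DataFrames: a content-type-to-writer mapping plus slicing, sizing, and file-writing helpers. Below is a minimal usage sketch, not part of the diff; the /tmp output path and the lambda path provider are assumptions for illustration.

```python
# Illustrative sketch only (not part of this release): exercising the new
# deltacat.utils.polars helpers. The /tmp path and lambda path provider
# below are assumptions for the example.
import polars as pl

from deltacat.types.media import ContentType
from deltacat.utils.polars import dataframe_to_file, slice_table

df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Zero-copy slices of at most 2 rows each -> [2-row frame, 1-row frame].
slices = slice_table(df, max_len=2)
assert [len(s) for s in slices] == [2, 1]

# Write the frame as Parquet; with file_system=None the writer uses
# polars' native path handling (write_parquet on the raw path).
dataframe_to_file(
    df,
    base_path="/tmp",
    file_system=None,
    block_path_provider=lambda base: f"{base}/deltacat_polars_example.parquet",
    content_type=ContentType.PARQUET.value,
)
```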
deltacat/utils/pyarrow.py CHANGED
@@ -13,11 +13,14 @@ from deltacat.exceptions import ContentTypeValidationError
  import pyarrow as pa
  import numpy as np
  import pyarrow.compute as pc
+ import pyarrow.fs as pafs
+
  from fsspec import AbstractFileSystem
  from pyarrow import csv as pacsv
  from pyarrow import feather as paf
  from pyarrow import json as pajson
  from pyarrow import parquet as papq
+ from pyarrow import orc as paorc
  from ray.data.datasource import FilenameProvider
  from deltacat.utils.s3fs import create_s3_file_system

@@ -40,8 +43,10 @@ from deltacat.utils.arguments import (
      sanitize_kwargs_to_callable,
      sanitize_kwargs_by_supported_kwargs,
  )
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
  from functools import lru_cache

+
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
@@ -103,6 +108,82 @@ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
          raise e


+ # TODO(pdames): Remove deprecated S3-only readers.
+ def read_csv(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **read_kwargs,
+ ) -> pa.Table:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+             return pacsv.read_csv(f, **read_kwargs)
+     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+         return pacsv.read_csv(f, **read_kwargs)
+
+
+ def read_feather(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **read_kwargs,
+ ) -> pa.Table:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+             return paf.read_feather(f, **read_kwargs)
+     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+         return paf.read_feather(f, **read_kwargs)
+
+
+ def read_json(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **read_kwargs,
+ ) -> pa.Table:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+             return pajson.read_json(f, **read_kwargs)
+     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+         return pajson.read_json(f, **read_kwargs)
+
+
+ def read_orc(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **read_kwargs,
+ ) -> pa.Table:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+             return paorc.read_table(f, **read_kwargs)
+     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+         return paorc.read_table(f, **read_kwargs)
+
+
+ def read_parquet(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **read_kwargs,
+ ) -> pa.Table:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+             return papq.read_table(f, **read_kwargs)
+     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+         return papq.read_table(f, **read_kwargs)
+
+
  CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
      ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
      ContentType.TSV.value: pyarrow_read_csv,
@@ -118,24 +199,78 @@ CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {


  def write_feather(
-     table: pa.Table, path: str, *, filesystem: AbstractFileSystem, **kwargs
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
  ) -> None:
-
-     with filesystem.open(path, "wb") as f:
-         paf.write_feather(table, f, **kwargs)
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             paf.write_feather(table, f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             paf.write_feather(table, f, **write_kwargs)


  def write_csv(
-     table: pa.Table, path: str, *, filesystem: AbstractFileSystem, **kwargs
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             pacsv.write_csv(table, f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             # TODO (pdames): Add support for client-specified compression types.
+             with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                 if write_kwargs.get("write_options") is None:
+                     # column names are kept in table metadata, so omit header
+                     write_kwargs["write_options"] = pacsv.WriteOptions(
+                         include_header=False
+                     )
+                 pacsv.write_csv(table, out, **write_kwargs)
+
+
+ def write_orc(
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
  ) -> None:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             paorc.write_table(table, f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             paorc.write_table(table, f, **write_kwargs)
+

-
-     with filesystem.open(path, "wb") as f:
-         # TODO (pdames): Add support for client-specified compression types.
-         with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
-             if kwargs.get("write_options") is None:
-                 # column names are kept in table metadata, so omit header
-                 kwargs["write_options"] = pacsv.WriteOptions(include_header=False)
+ def write_parquet(
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             papq.write_table(table, f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             papq.write_table(table, f, **write_kwargs)


  CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
@@ -143,7 +278,8 @@ CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
      # pyarrow adds support for custom delimiters, escaping, and None value
      # representations to pyarrow.csv.WriteOptions.
      ContentType.CSV.value: write_csv,
-     ContentType.PARQUET.value: papq.write_table,
+     ContentType.ORC.value: write_orc,
+     ContentType.PARQUET.value: write_parquet,
      ContentType.FEATHER.value: write_feather,
  }

@@ -180,7 +316,7 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
  ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
      ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
      ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
-     ContentEncoding.IDENTITY.value: lambda s3_file: s3_file,
+     ContentEncoding.IDENTITY.value: lambda file_path: file_path,
  }


@@ -522,7 +658,7 @@ def parquet_file_size(table: papq.ParquetFile) -> int:
  def table_to_file(
      table: pa.Table,
      base_path: str,
-     file_system: AbstractFileSystem,
+     file_system: Optional[AbstractFileSystem],
      block_path_provider: Union[Callable, FilenameProvider],
      content_type: str = ContentType.PARQUET.value,
      **kwargs,
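The reworked pyarrow readers and writers dispatch on the filesystem argument: with no filesystem, or a pyarrow.fs.FileSystem, the path is resolved through resolve_path_and_filesystem and opened via pyarrow input/output streams; with an fsspec AbstractFileSystem, they fall back to filesystem.open. A minimal calling sketch follows; it is not part of the diff, and the local /tmp paths are assumptions for illustration.

```python
# Illustrative sketch only (not part of this release): shows the filesystem
# dispatch in the new deltacat.utils.pyarrow writers/readers. Local /tmp
# paths are assumptions for the example.
import pyarrow as pa
import pyarrow.fs as pafs
from fsspec.implementations.local import LocalFileSystem

from deltacat.utils.pyarrow import read_parquet, write_parquet

table = pa.table({"id": [1, 2, 3]})

# No filesystem given: the path is resolved to a pyarrow filesystem
# internally and written through open_output_stream().
write_parquet(table, "/tmp/deltacat_pa_example.parquet")

# Explicit pyarrow filesystem: takes the same pyarrow-stream code path.
write_parquet(
    table,
    "/tmp/deltacat_pa_example2.parquet",
    filesystem=pafs.LocalFileSystem(),
)

# fsspec filesystem: falls back to filesystem.open(path, "wb"/"rb").
fs = LocalFileSystem()
write_parquet(table, "/tmp/deltacat_pa_example3.parquet", filesystem=fs)
roundtrip = read_parquet("/tmp/deltacat_pa_example3.parquet", filesystem=fs)
assert roundtrip.num_rows == 3
```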
deltacat/utils/ray_utils/concurrency.py CHANGED
@@ -88,7 +88,7 @@ def round_robin_options_provider(
      **kwargs,
  ) -> Dict[str, Any]:
      """Returns a resource dictionary that can be included with ray remote
-     options to round robin indexed tasks or actors across a list of resource
+     options to round-robin indexed tasks or actors across a list of resource
      keys. For example, the following code round-robins 100 tasks across all
      live cluster nodes:
      ```
deltacat/utils/ray_utils/runtime.py CHANGED
@@ -21,7 +21,7 @@ def node_resource_keys(
      keys = []
      node_dict = ray.nodes()
      if node_dict:
-         for node in ray.nodes():
+         for node in node_dict:
              if filter_fn(node):
                  for key in node["Resources"].keys():
                      if key.startswith("node:"):
@@ -37,7 +37,7 @@ def current_node_resource_key() -> str:
      actors on that node via:
      `foo.options(resources={get_current_node_resource_key(): 0.01}).remote()`
      """
-     current_node_id = ray.get_runtime_context().get_node_id().hex()
+     current_node_id = ray.get_runtime_context().get_node_id()
      keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
      assert (
          len(keys) <= 1
@@ -45,6 +45,47 @@ def current_node_resource_key() -> str:
      return keys[0] if len(keys) == 1 else None


+ def current_node_resources() -> Dict[str, float]:
+     """Gets Ray's resources for the current node as a dictionary.
+
+     Example Return Value:
+     >>> {
+     >>>     'memory': 17611605607.0,
+     >>>     'node:127.0.0.1': 1.0,
+     >>>     'node:__internal_head__': 1.0,
+     >>>     'object_store_memory': 2147483648.0,
+     >>>     'CPU': 10.0,
+     >>> }
+     """
+     current_node_id = ray.get_runtime_context().get_node_id()
+     node_dict = ray.nodes()
+     if node_dict:
+         for node in node_dict:
+             if node["NodeID"] == current_node_id:
+                 return node["Resources"]
+     else:
+         raise ValueError("No node dictionary found on current node.")
+     return {}
+
+
+ def find_max_single_node_resource_type(resource_type: str) -> float:
+     """Finds the max resource amount available on any single cluster node
+     for the given resource type. Returns the max resource amount as a float."""
+     node_dict = ray.nodes()
+     max_single_node_resource_amount = 0
+     if node_dict:
+         for node in node_dict:
+             node_resource_amount = node["Resources"].get(resource_type)
+             if node_resource_amount is not None:
+                 max_single_node_resource_amount = max(
+                     max_single_node_resource_amount,
+                     node_resource_amount,
+                 )
+     else:
+         raise ValueError("No node dictionary found on current node.")
+     return max_single_node_resource_amount
+
+
  def is_node_alive(node: Dict[str, Any]) -> bool:
      """Takes a node from `ray.nodes()` as input. Returns True if the node is
      alive, and False otherwise."""
@@ -67,6 +108,17 @@ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) ->
          time.sleep(poll_interval_seconds)


+ def live_cpu_waiter(min_live_cpus: int, poll_interval_seconds: float = 0.5) -> None:
+     """Waits until the given minimum number of live CPUs are present in the
+     cluster. Checks the current number of live CPUs every
+     `poll_interval_seconds`."""
+     live_cpus = cluster_cpus()
+     while live_cpus < min_live_cpus:
+         live_cpus = cluster_cpus()
+         logger.info(f"Waiting for Live CPUs: {live_cpus}/{min_live_cpus}")
+         time.sleep(poll_interval_seconds)
+
+
  def live_node_resource_keys() -> List[str]:
      """Get Ray resource keys for all live cluster nodes as a list of strings of
      the form: "node:{node_resource_name}". The returned keys can be used to
@@ -83,7 +135,7 @@ def other_live_node_resource_keys() -> List[str]:

      For example, invoking this function from your Ray application driver on the
      head node returns the resource keys of all live worker nodes."""
-     current_node_id = ray.get_runtime_context().get_node_id().hex()
+     current_node_id = ray.get_runtime_context().get_node_id()
      return node_resource_keys(
          lambda n: n["NodeID"] != current_node_id and is_node_alive(n)
      )
@@ -97,7 +149,7 @@ def other_node_resource_keys() -> List[str]:

      For example, invoking this function from your Ray application driver on the
      head node returns the resource keys of all worker nodes."""
-     current_node_id = ray.get_runtime_context().get_node_id().hex()
+     current_node_id = ray.get_runtime_context().get_node_id()
      return node_resource_keys(lambda n: n["NodeID"] != current_node_id)

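The runtime.py additions (current_node_resources, find_max_single_node_resource_type, live_cpu_waiter) and the switch from get_node_id().hex() to get_node_id() (which returns the node ID as a hex string in recent Ray releases) are driver-side cluster introspection helpers. A minimal usage sketch follows; it is not part of the diff, requires a running Ray cluster, and the 4-CPU threshold is an assumption for illustration.

```python
# Illustrative sketch only (not part of this release): driver-side use of
# the new ray_utils.runtime helpers. Requires a running Ray cluster; the
# 4-CPU threshold below is an assumption for the example.
import ray

from deltacat.utils.ray_utils.runtime import (
    current_node_resources,
    find_max_single_node_resource_type,
    live_cpu_waiter,
    live_node_waiter,
)

ray.init()

# Block until the cluster reports at least 1 live node and 4 live CPUs.
live_node_waiter(min_live_nodes=1)
live_cpu_waiter(min_live_cpus=4)

# Resources Ray reports for the node running this driver, e.g.
# {'CPU': 10.0, 'memory': ..., 'node:127.0.0.1': 1.0, ...}.
print(current_node_resources())

# Largest CPU count available on any single node; useful when sizing a
# task or actor that must fit on one machine.
print(find_max_single_node_resource_type("CPU"))
```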