deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,11 @@ import logging
2
2
  from typing import Callable, Dict, List, Optional, Union
3
3
 
4
4
  from fsspec import AbstractFileSystem
5
+
6
+ import pyarrow as pa
5
7
  from pyarrow import csv as pacsv
8
+ import pyarrow.fs as pafs
9
+
6
10
  from ray.data import Dataset
7
11
  from ray.data.datasource import FilenameProvider
8
12
 
@@ -16,7 +20,7 @@ def write_parquet(
16
20
  dataset: Dataset,
17
21
  base_path: str,
18
22
  *,
19
- filesystem: AbstractFileSystem,
23
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
20
24
  block_path_provider: Union[Callable, FilenameProvider],
21
25
  **kwargs,
22
26
  ) -> None:
@@ -34,16 +38,36 @@ def write_csv(
34
38
  dataset: Dataset,
35
39
  base_path: str,
36
40
  *,
37
- filesystem: AbstractFileSystem,
41
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
38
42
  block_path_provider: Union[Callable, FilenameProvider],
39
43
  **kwargs,
40
44
  ) -> None:
45
+ """
46
+ Write a Ray Dataset to a CSV file (or other delimited text format).
47
+ """
48
+ # Extract CSV-specific options from kwargs
49
+ delimiter = kwargs.pop("delimiter", ",")
50
+ quoting_style = kwargs.pop("quoting_style", None)
51
+ include_header = kwargs.pop("include_header", False)
52
+
53
+ # Create a function that will generate WriteOptions inside the worker process
54
+ def arrow_csv_args_fn():
55
+ write_options = pacsv.WriteOptions(
56
+ delimiter=delimiter,
57
+ include_header=include_header,
58
+ quoting_style=quoting_style,
59
+ )
60
+ return {"write_options": write_options}
61
+
62
+ # Check if the block_path_provider will generate .gz files to avoid double compression
63
+ pa_open_stream_args = {}
64
+ if not (
65
+ hasattr(block_path_provider, "content_encoding")
66
+ and block_path_provider.content_encoding == ContentEncoding.GZIP
67
+ ):
68
+ # Block path provider will not generate .gz files, so we need to apply explicit compression
69
+ pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
41
70
 
42
- # column names are kept in table metadata, so omit header
43
- arrow_csv_args_fn = lambda: {
44
- "write_options": pacsv.WriteOptions(include_header=False)
45
- }
46
- pa_open_stream_args = {"compression": ContentEncoding.GZIP.value}
47
71
  dataset.write_csv(
48
72
  base_path,
49
73
  arrow_open_stream_args=pa_open_stream_args,
@@ -55,12 +79,76 @@ def write_csv(
55
79
  )
56
80
 
57
81
 
82
+ def write_json(
83
+ dataset: Dataset,
84
+ base_path: str,
85
+ *,
86
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
87
+ block_path_provider: Union[Callable, FilenameProvider],
88
+ **kwargs,
89
+ ) -> None:
90
+ """
91
+ Write a Ray Dataset to a JSON file using Ray's native JSON writer.
92
+ """
93
+ # Check if the block_path_provider will generate .gz files to avoid double compression
94
+ pa_open_stream_args = {}
95
+ if not (
96
+ hasattr(block_path_provider, "content_encoding")
97
+ and block_path_provider.content_encoding == ContentEncoding.GZIP
98
+ ):
99
+ # Block path provider will not generate .gz files, so we need to apply explicit compression
100
+ pa_open_stream_args["compression"] = ContentEncoding.GZIP.value
101
+
102
+ dataset.write_json(
103
+ base_path,
104
+ arrow_open_stream_args=pa_open_stream_args,
105
+ filesystem=filesystem,
106
+ try_create_dir=False,
107
+ filename_provider=block_path_provider,
108
+ **kwargs,
109
+ )
110
+
111
+
58
112
  CONTENT_TYPE_TO_DATASET_WRITE_FUNC: Dict[str, Callable] = {
113
+ ContentType.UNESCAPED_TSV.value: write_csv,
114
+ ContentType.TSV.value: write_csv,
59
115
  ContentType.CSV.value: write_csv,
116
+ ContentType.PSV.value: write_csv,
60
117
  ContentType.PARQUET.value: write_parquet,
118
+ ContentType.JSON.value: write_json,
61
119
  }
62
120
 
63
121
 
122
+ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, any]:
123
+ """
124
+ Returns writer kwargs for the given content type when writing with Ray Dataset.
125
+ """
126
+ if content_type == ContentType.UNESCAPED_TSV.value:
127
+ return {
128
+ "delimiter": "\t",
129
+ "include_header": False,
130
+ "quoting_style": "none",
131
+ }
132
+ if content_type == ContentType.TSV.value:
133
+ return {
134
+ "delimiter": "\t",
135
+ "include_header": False,
136
+ }
137
+ if content_type == ContentType.CSV.value:
138
+ return {
139
+ "delimiter": ",",
140
+ "include_header": False,
141
+ }
142
+ if content_type == ContentType.PSV.value:
143
+ return {
144
+ "delimiter": "|",
145
+ "include_header": False,
146
+ }
147
+ if content_type in {ContentType.PARQUET.value, ContentType.JSON.value}:
148
+ return {}
149
+ raise ValueError(f"Unsupported content type: {content_type}")
150
+
151
+
64
152
  def slice_dataset(dataset: Dataset, max_len: Optional[int]) -> List[Dataset]:
65
153
  """
66
154
  Returns equally-sized dataset slices of up to `max_len` records each.
@@ -88,9 +176,10 @@ def dataset_size(dataset: Dataset) -> int:
88
176
  def dataset_to_file(
89
177
  table: Dataset,
90
178
  base_path: str,
91
- file_system: AbstractFileSystem,
179
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
92
180
  block_path_provider: Union[Callable, FilenameProvider],
93
181
  content_type: str = ContentType.PARQUET.value,
182
+ schema: Optional[pa.Schema] = None,
94
183
  **kwargs,
95
184
  ) -> None:
96
185
  """
@@ -103,10 +192,12 @@ def dataset_to_file(
103
192
  f" implemented. Known content types: "
104
193
  f"{CONTENT_TYPE_TO_DATASET_WRITE_FUNC.keys}"
105
194
  )
195
+ writer_kwargs = content_type_to_writer_kwargs(content_type)
196
+ writer_kwargs.update(kwargs)
106
197
  writer(
107
198
  table,
108
199
  base_path,
109
- filesystem=file_system,
200
+ filesystem=filesystem,
110
201
  block_path_provider=block_path_provider,
111
- **kwargs,
202
+ **writer_kwargs,
112
203
  )