deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194) hide show
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,473 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DeltaCAT Catalog Explorer
4
+
5
+ Discover candidate streams and partitions for compaction.
6
+ It provides an easy way to explore catalog contents and generates example compaction commands.
7
+
8
+ Usage:
9
+ # Explore default test catalog (from bootstrap.py)
10
+ python explorer.py
11
+
12
+ # Explore with custom catalog root
13
+ python explorer.py --catalog-root /path/to/catalog
14
+
15
+ # Explore specific URL
16
+ python explorer.py --url "dc://my_catalog/my_namespace"
17
+
18
+ # Non-recursive listing
19
+ python explorer.py --no-recursive
20
+
21
+ Examples:
22
+ # After running bootstrap.py
23
+ python bootstrap.py --catalog-root /tmp/deltacat_test
24
+ python explorer.py --catalog-root /tmp/deltacat_test
25
+
26
+ # Explore and find compaction candidates
27
+ python explorer.py --catalog-root /tmp/deltacat_test --show-compaction-candidates
28
+ """
29
+
30
+ import argparse
31
+ import os
32
+ import sys
33
+ from typing import List, Tuple
34
+
35
+ import deltacat as dc
36
+ from deltacat import DeltaCatUrl
37
+ from deltacat.storage.model.namespace import Namespace
38
+ from deltacat.storage.model.table import Table
39
+ from deltacat.storage.model.table_version import TableVersion
40
+ from deltacat.storage.model.stream import Stream
41
+ from deltacat.storage.model.partition import Partition
42
+ from deltacat.storage.model.delta import Delta
43
+
44
+ # Import common utilities
45
+ from deltacat.examples.compactor.utils.common import (
46
+ get_default_catalog_root,
47
+ initialize_deltacat_url_catalog,
48
+ format_partition_values_for_command,
49
+ get_max_stream_position_from_partition,
50
+ get_bootstrap_destination_info,
51
+ )
52
+
53
+
54
def setup_catalog(
    catalog_root: str, catalog_name: str = "compactor_test_catalog"
) -> DeltaCatUrl:
    """Announce and register the catalog that will be explored.

    Args:
        catalog_root: Root location of the catalog; echoed to stdout and
            forwarded to ``initialize_deltacat_url_catalog``.
        catalog_name: Name forwarded to ``initialize_deltacat_url_catalog``.

    Returns:
        The value returned by ``initialize_deltacat_url_catalog``.
    """
    # The emoji was mis-encoded (UTF-8 bytes shown through a single-byte
    # codepage); it is restored here. No placeholder, so no f-prefix.
    print("🔧 Initializing catalog...")
    print(f" Catalog root: {catalog_root}")
    print(f" Catalog name: {catalog_name}")

    return initialize_deltacat_url_catalog(catalog_root, catalog_name)
63
+
64
+
65
def find_compaction_candidates(
    all_objects: List,
) -> List[Tuple[Partition, Stream, TableVersion, Table, Namespace]]:
    """Select the partitions that are worth offering for compaction.

    A partition is kept when its namespace, table, table version and stream
    are all found (and truthy) in ``all_objects``, the stream's ``state``
    compares equal to ``"committed"``, and at least one delta in
    ``all_objects`` refers to the partition.

    Args:
        all_objects: Flat listing of catalog objects of mixed types.

    Returns:
        ``(partition, stream, table_version, table, namespace)`` tuples, in
        the order the partitions appear in ``all_objects``.
    """

    def of_kind(kind):
        # One filtered pass over the listing per metadata type.
        return [item for item in all_objects if isinstance(item, kind)]

    # Lookup tables keyed by the identifying fields of each level.
    namespace_by_name = {ns.namespace: ns for ns in of_kind(Namespace)}
    table_by_key = {(tbl.namespace, tbl.table_name): tbl for tbl in of_kind(Table)}
    version_by_key = {
        (ver.namespace, ver.table_name, ver.table_version): ver
        for ver in of_kind(TableVersion)
    }
    stream_by_key = {
        (strm.namespace, strm.table_name, strm.table_version, strm.stream_id): strm
        for strm in of_kind(Stream)
    }
    partition_items = of_kind(Partition)

    # Bucket the deltas under the partition they belong to.
    grouped_deltas = {}
    for delta in of_kind(Delta):
        owner_key = (
            delta.namespace,
            delta.table_name,
            delta.table_version,
            delta.stream_id,
            delta.partition_id,
        )
        grouped_deltas.setdefault(owner_key, []).append(delta)

    selected = []
    for part in partition_items:
        table_key = (part.namespace, part.table_name)
        version_key = table_key + (part.table_version,)
        stream_key = version_key + (part.stream_id,)
        delta_key = stream_key + (part.partition_id,)

        owner_namespace = namespace_by_name.get(part.namespace)
        owner_table = table_by_key.get(table_key)
        owner_version = version_by_key.get(version_key)
        owner_stream = stream_by_key.get(stream_key)
        owned_deltas = grouped_deltas.get(delta_key, [])

        # Skip partitions whose ancestry is incomplete in this listing.
        if not (owner_namespace and owner_table and owner_version and owner_stream):
            continue

        # Needs a committed stream and at least one delta (more is better).
        if owner_stream.state == "committed" and owned_deltas:
            selected.append(
                (part, owner_stream, owner_version, owner_table, owner_namespace)
            )

    return selected
136
+
137
+
138
def generate_compaction_command(
    partition: Partition,
    stream: Stream,
    table_version: TableVersion,
    table: Table,
    namespace: Namespace,
    catalog_root: str,
) -> str:
    """Build an example ``compactor.py`` invocation for one partition.

    Args:
        partition: Partition whose values are used for both the source and
            destination ``--*partition-values`` options.
        stream: Accepted for signature symmetry; not read here.
        table_version: Supplies ``--table-version``.
        table: Supplies ``--table-name``.
        namespace: Supplies ``--namespace``.
        catalog_root: Supplies ``--catalog-root`` and the catalog properties
            used to look up the last stream position.

    Returns:
        A multi-line shell command; every line except the last ends with a
        space and a backslash.
    """
    formatted_values = format_partition_values_for_command(partition.partition_values)

    dest_namespace, dest_table_name = get_bootstrap_destination_info(
        namespace.namespace, table.table_name
    )

    # Stream position is looked up through the catalog at ``catalog_root``.
    from deltacat.catalog.model.properties import CatalogProperties

    catalog_props = CatalogProperties(root=catalog_root)

    if partition.partition_values:
        values_as_list = list(partition.partition_values)
    else:
        values_as_list = []

    last_position = get_max_stream_position_from_partition(
        namespace.namespace,
        table.table_name,
        table_version.table_version,
        values_as_list,
        catalog_props,
    )

    # NOTE(review): continuation lines carry no leading indent, matching the
    # text visible in this view - confirm against the checked-in file.
    pieces = [
        "python compactor.py",
        f"--namespace '{namespace.namespace}'",
        f"--table-name '{table.table_name}'",
        f"--table-version '{table_version.table_version}'",
        f"--partition-values '{formatted_values}'",
        f"--dest-namespace '{dest_namespace}'",
        f"--dest-table-name '{dest_table_name}'",
        "--dest-table-version '1'",
        f"--dest-partition-values '{formatted_values}'",
        f"--last-stream-position {last_position}",
        "--primary-keys 'id'",
        "--compactor-version 'V2'",
        "--hash-bucket-count 1",
        f"--catalog-root '{catalog_root}'",
    ]
    return " \\\n".join(pieces)
187
+
188
+
189
def print_catalog_summary(all_objects: List) -> None:
    """Print how many objects of each metadata type the listing contains.

    Args:
        all_objects: Flat listing of catalog objects of mixed types.
    """
    labelled_kinds = (
        ("Namespaces", Namespace),
        ("Tables", Table),
        ("Table Versions", TableVersion),
        ("Streams", Stream),
        ("Partitions", Partition),
        ("Deltas", Delta),
    )
    # Count in place rather than building six lists only to take len().
    counts = [
        (label, sum(1 for obj in all_objects if isinstance(obj, kind)))
        for label, kind in labelled_kinds
    ]

    # The header emoji was mis-encoded; restored here.
    print("\n📊 Catalog Summary:")
    for label, count in counts:
        print(f" {label}: {count}")
    print(f" Total Objects: {len(all_objects)}")
206
+
207
+
208
def print_detailed_listing(all_objects: List) -> None:
    """Print one entry per catalog object, with each partition's deltas.

    Objects are printed in the order they appear in ``all_objects``. Deltas
    get no entry of their own; they are grouped by owning partition and
    printed beneath it in ascending ``stream_position`` order.

    Args:
        all_objects: Flat listing of catalog objects of mixed types.
    """
    # Mis-encoded emoji in the messages below are restored.
    print("\n📋 Detailed Catalog Listing:")

    # Group deltas by owning partition so they can be shown under it.
    deltas_by_partition = {}
    for obj in all_objects:
        if isinstance(obj, Delta):
            owner_key = (
                obj.namespace,
                obj.table_name,
                obj.table_version,
                obj.stream_id,
                obj.partition_id,
            )
            deltas_by_partition.setdefault(owner_key, []).append(obj)

    # The bare attribute reads that preceded each print were no-ops and
    # have been dropped.
    for obj in all_objects:
        if isinstance(obj, Namespace):
            print(f"📁 Namespace: {obj.namespace}")
        elif isinstance(obj, Table):
            print(f" 📊 Table: {obj.table_name}")
        elif isinstance(obj, TableVersion):
            print(f" 📌 Table Version: {obj.table_version} (state: {obj.state})")
        elif isinstance(obj, Stream):
            print(f" 🌊 Stream: {obj.stream_id}")
            print(f" Format: {obj.stream_format}")
            print(f" State: {obj.state}")
        elif isinstance(obj, Partition):
            print(f" 📦 Partition: {obj.partition_id}")
            if obj.partition_values:
                print(f" Values: {obj.partition_values}")

            partition_key = (
                obj.namespace,
                obj.table_name,
                obj.table_version,
                obj.stream_id,
                obj.partition_id,
            )
            partition_deltas = deltas_by_partition.get(partition_key)
            if not partition_deltas:
                print(" ⚠️ No deltas found")
                continue

            for delta in sorted(partition_deltas, key=lambda d: d.stream_position):
                print(f" 📄 Delta at position: {delta.stream_position}")
265
+
266
+
267
def print_compaction_candidates(candidates: List, catalog_root: str) -> None:
    """Print each compaction candidate with its delta statistics.

    For every candidate the deltas are listed through
    ``metastore.list_partition_deltas`` to report the delta count, the
    largest stream position and the summed record count. The lookup is
    best-effort: if anything in it raises, the three statistics are shown as
    ``unknown`` and the listing carries on. An example command is printed
    for the first candidate only.

    Args:
        candidates: ``(partition, stream, table_version, table, namespace)``
            tuples, as produced by ``find_compaction_candidates``.
        catalog_root: Catalog root used to build ``CatalogProperties`` and
            passed on to ``generate_compaction_command``.
    """
    # Mis-encoded emoji in the messages below are restored.
    if not candidates:
        print("\n⚠️ No compaction candidates found.")
        print(
            "💡 Tip: Compaction candidates are partitions with committed streams and deltas."
        )
        print(" Tables need multiple deltas to benefit from compaction.")
        return

    print("\n🎯 Compaction Candidates:")
    print(f" Found {len(candidates)} partition(s) ready for compaction")

    for i, (partition, stream, table_version, table, namespace) in enumerate(
        candidates, 1
    ):
        try:
            from deltacat.storage import metastore
            from deltacat.catalog.model.properties import CatalogProperties

            catalog = CatalogProperties(root=catalog_root)

            # NOTE(review): the locator is assembled by hand with a fixed
            # "deltacat" format and no partition values; confirm that
            # list_partition_deltas resolves it by partition id alone.
            partition_locator = {
                "streamLocator": {
                    "tableVersionLocator": {
                        "tableLocator": {
                            "namespaceLocator": {"namespace": namespace.namespace},
                            "tableName": table.table_name,
                        },
                        "tableVersion": table_version.table_version,
                    },
                    "streamId": stream.stream_id,
                    "format": "deltacat",
                },
                "partitionValues": None,
                "partitionId": partition.partition_id,
            }

            # Throwaway object exposing only a ``locator`` attribute.
            partition_like = type("obj", (object,), {"locator": partition_locator})()
            partition_deltas = metastore.list_partition_deltas(
                partition_like=partition_like,
                include_manifest=True,
                catalog=catalog,
            )

            delta_list = partition_deltas.all_items()
            delta_count = len(delta_list)
            max_stream_position = (
                max(delta.stream_position for delta in delta_list)
                if delta_list
                else 0
            )
            total_records = sum(
                delta.meta.record_count if delta.meta else 0 for delta in delta_list
            )
        except Exception:
            # Deliberate best-effort: statistics are optional, so any
            # failure degrades to "unknown" instead of aborting the listing.
            delta_count = "unknown"
            max_stream_position = "unknown"
            total_records = "unknown"

        print(f"\n📦 Candidate {i}:")
        print(f" Namespace: {namespace.namespace}")
        print(f" Table: {table.table_name}")
        print(f" Table Version: {table_version.table_version}")
        print(f" Stream: {stream.stream_id}")
        print(f" Partition: {partition.partition_id}")
        print(f" Stream State: {stream.state}")
        print(f" Deltas: {delta_count}")
        if delta_count != "unknown" and delta_count > 0:
            print(f" Total Records: {total_records}")
            print(f" Max Stream Position: {max_stream_position}")
            if delta_count > 1:
                print(" 🎯 Good candidate: Multiple deltas available for compaction")
            else:
                print(" ⚠️ Single delta: Limited compaction benefit")

        if i == 1:  # Show command for first candidate
            command = generate_compaction_command(
                partition, stream, table_version, table, namespace, catalog_root
            )
            print(f"\n🚀 Compaction command for candidate {i}:")
            print(" cd deltacat/examples/compactor")
            for line in command.split("\n"):
                if line.strip():
                    print(f" {line}")
357
+
358
+
359
def main() -> int:
    """Run the explorer command-line interface.

    Parses the arguments, registers the catalog, lists its objects, prints a
    summary and a detailed listing, and optionally prints compaction
    candidates.

    Returns:
        ``0`` on success or when the catalog holds no objects; ``1`` when the
        catalog root does not exist or exploration raises.
    """
    # Resolve the default once so the option default and its help text agree.
    default_root = get_default_catalog_root()
    # argparse %-formats help strings, so a literal "%" must be doubled.
    default_root_for_help = str(default_root).replace("%", "%%")

    parser = argparse.ArgumentParser(
        description="Explore DeltaCAT catalog contents and find compaction candidates",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Explore default test catalog (after running bootstrap.py)
python explorer.py --catalog-root /tmp/deltacat_test

# Explore specific URL
python explorer.py --url "dc://my_catalog/my_namespace"

# Show compaction candidates with example commands
python explorer.py --catalog-root /tmp/deltacat_test --show-compaction-candidates

# Non-recursive listing (step by step)
python explorer.py --catalog-root /tmp/deltacat_test --no-recursive
""",
    )

    parser.add_argument(
        "--catalog-root",
        type=str,
        default=default_root,
        help=f"Root directory for the DeltaCAT catalog (default: {default_root_for_help}, same as bootstrap.py)",
    )

    parser.add_argument(
        "--url",
        type=str,
        help="Specific DeltaCAT URL to explore (e.g., 'dc://catalog/namespace'). If not provided, uses the full catalog.",
    )

    parser.add_argument(
        "--no-recursive",
        action="store_true",
        help="Disable recursive listing (only list top-level objects)",
    )

    parser.add_argument(
        "--show-compaction-candidates",
        action="store_true",
        help="Show partitions that are candidates for compaction with example commands",
    )

    parser.add_argument(
        "--catalog-name",
        type=str,
        default="compactor_test_catalog",
        help="Name to register the catalog under (default: compactor_test_catalog)",
    )

    args = parser.parse_args()

    # Mis-encoded emoji in the messages below are restored.
    # Validate catalog root exists
    if not os.path.exists(args.catalog_root):
        print(f"❌ Error: Catalog root directory does not exist: {args.catalog_root}")
        print("💡 Tip: Run bootstrap.py first to create test data:")
        print(f" python bootstrap.py --catalog-root {args.catalog_root}")
        return 1

    print("🔍 DeltaCAT Catalog Explorer")
    print("=" * 50)

    try:
        # Setup catalog
        catalog_url = setup_catalog(args.catalog_root, args.catalog_name)

        # Determine what URL to explore
        if args.url:
            explore_url = DeltaCatUrl(args.url)
            print(f"🎯 Exploring specific URL: {args.url}")
        else:
            explore_url = catalog_url
            print(f"🎯 Exploring full catalog: {catalog_url.url}")

        # List objects
        recursive = not args.no_recursive
        print(f"📖 Listing mode: {'Recursive' if recursive else 'Non-recursive'}")

        all_objects = dc.list(explore_url, recursive=recursive)

        if not all_objects:
            print("\n⚠️ No objects found in catalog.")
            print("💡 Tip: Run bootstrap.py to create test data:")
            print(f" python bootstrap.py --catalog-root {args.catalog_root}")
            return 0

        print_catalog_summary(all_objects)
        print_detailed_listing(all_objects)

        # Show compaction candidates if requested
        if args.show_compaction_candidates:
            candidates = find_compaction_candidates(all_objects)
            print_compaction_candidates(candidates, args.catalog_root)

        print("\n✅ Catalog exploration completed!")

    except Exception as e:
        # Top-level boundary: report the failure with a traceback and
        # signal it through the exit code.
        print(f"\n❌ Error exploring catalog: {str(e)}")
        import traceback

        traceback.print_exc()
        return 1

    return 0
470
+
471
+
472
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1 @@
1
+ # DeltaCAT Compactor GCP Examples