deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,25 @@
1
1
  from __future__ import annotations
2
- from typing import Dict, List
2
+ from typing import Dict, List, Any, Optional
3
3
  from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
4
+ from fsspec import AbstractFileSystem
4
5
 
5
6
 
6
7
  class ConvertInput(Dict):
7
8
  @staticmethod
8
9
  def of(
9
- convert_input_files,
10
- convert_task_index,
11
- iceberg_table_warehouse_prefix,
12
- identifier_fields,
13
- table_io,
14
- table_metadata,
15
- compact_previous_position_delete_files,
16
- enforce_primary_key_uniqueness,
17
- position_delete_for_multiple_data_files,
18
- max_parallel_data_file_download,
19
- s3_file_system,
20
- s3_client_kwargs,
10
+ convert_input_files: ConvertInputFiles,
11
+ convert_task_index: int,
12
+ iceberg_table_warehouse_prefix: str,
13
+ identifier_fields: List[str],
14
+ table_io: Any,
15
+ table_metadata: Any,
16
+ compact_previous_position_delete_files: bool,
17
+ enforce_primary_key_uniqueness: bool,
18
+ position_delete_for_multiple_data_files: bool,
19
+ max_parallel_data_file_download: int,
20
+ filesystem: Optional[AbstractFileSystem],
21
+ s3_client_kwargs: Optional[Dict[str, Any]],
22
+ task_memory: float,
21
23
  ) -> ConvertInput:
22
24
 
23
25
  result = ConvertInput()
@@ -35,8 +37,9 @@ class ConvertInput(Dict):
35
37
  "position_delete_for_multiple_data_files"
36
38
  ] = position_delete_for_multiple_data_files
37
39
  result["max_parallel_data_file_download"] = max_parallel_data_file_download
38
- result["s3_file_system"] = s3_file_system
40
+ result["filesystem"] = filesystem
39
41
  result["s3_client_kwargs"] = s3_client_kwargs
42
+ result["task_memory"] = task_memory
40
43
 
41
44
  return result
42
45
 
@@ -57,11 +60,11 @@ class ConvertInput(Dict):
57
60
  return self["iceberg_table_warehouse_prefix"]
58
61
 
59
62
  @property
60
- def table_io(self):
63
+ def table_io(self) -> Any:
61
64
  return self["table_io"]
62
65
 
63
66
  @property
64
- def table_metadata(self):
67
+ def table_metadata(self) -> Any:
65
68
  return self["table_metadata"]
66
69
 
67
70
  @property
@@ -81,9 +84,13 @@ class ConvertInput(Dict):
81
84
  return self["max_parallel_data_file_download"]
82
85
 
83
86
  @property
84
- def s3_file_system(self):
85
- return self["s3_file_system"]
87
+ def filesystem(self) -> Optional[AbstractFileSystem]:
88
+ return self["filesystem"]
86
89
 
87
90
  @property
88
- def s3_client_kwargs(self):
91
+ def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
89
92
  return self["s3_client_kwargs"]
93
+
94
+ @property
95
+ def task_memory(self) -> float:
96
+ return self["task_memory"]
@@ -1,15 +1,21 @@
1
1
  from __future__ import annotations
2
- from typing import Dict
2
+ from typing import Dict, List, Any, Optional, Tuple
3
+ from pyiceberg.manifest import DataFile
4
+
5
+ # Type aliases to simplify nested types
6
+ DataFileWithSequence = Tuple[int, DataFile] # (sequence_number, data_file)
7
+ DataFileList = List[DataFileWithSequence] # List of data files with sequence numbers
8
+ DataFileListGroup = List[DataFileList] # Group of data file lists
3
9
 
4
10
 
5
11
  class ConvertInputFiles(Dict):
6
12
  @staticmethod
7
13
  def of(
8
- partition_value,
9
- all_data_files_for_dedupe=None,
10
- applicable_data_files=None,
11
- applicable_equality_delete_files=None,
12
- existing_position_delete_files=None,
14
+ partition_value: Any,
15
+ all_data_files_for_dedupe: Optional[DataFileList] = None,
16
+ applicable_data_files: Optional[DataFileListGroup] = None,
17
+ applicable_equality_delete_files: Optional[DataFileListGroup] = None,
18
+ existing_position_delete_files: Optional[DataFileList] = None,
13
19
  ) -> ConvertInputFiles:
14
20
 
15
21
  result = ConvertInputFiles()
@@ -21,41 +27,52 @@ class ConvertInputFiles(Dict):
21
27
  return result
22
28
 
23
29
  @property
24
- def partition_value(self):
30
+ def partition_value(self) -> Any:
25
31
  return self["partition_value"]
26
32
 
27
33
  @property
28
- def all_data_files_for_dedupe(self):
34
+ def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
29
35
  return self["all_data_files_for_dedupe"]
30
36
 
31
37
  @property
32
- def applicable_data_files(self):
38
+ def applicable_data_files(self) -> Optional[DataFileListGroup]:
33
39
  return self["applicable_data_files"]
34
40
 
35
41
  @property
36
- def applicable_equality_delete_files(self):
42
+ def applicable_equality_delete_files(
43
+ self,
44
+ ) -> Optional[DataFileListGroup]:
37
45
  return self["applicable_equality_delete_files"]
38
46
 
39
47
  @property
40
- def existing_position_delete_files(self):
48
+ def existing_position_delete_files(self) -> Optional[DataFileList]:
41
49
  return self["existing_position_delete_files"]
42
50
 
43
51
  @partition_value.setter
44
- def partition_value(self, partition_value):
52
+ def partition_value(self, partition_value: Any) -> None:
45
53
  self["partition_value"] = partition_value
46
54
 
47
55
  @all_data_files_for_dedupe.setter
48
- def all_data_files_for_dedupe(self, all_data_files_for_dedupe):
56
+ def all_data_files_for_dedupe(
57
+ self, all_data_files_for_dedupe: Optional[DataFileList]
58
+ ) -> None:
49
59
  self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
50
60
 
51
61
  @applicable_data_files.setter
52
- def applicable_data_files(self, applicable_data_files):
62
+ def applicable_data_files(
63
+ self, applicable_data_files: Optional[DataFileListGroup]
64
+ ) -> None:
53
65
  self["applicable_data_files"] = applicable_data_files
54
66
 
55
67
  @applicable_equality_delete_files.setter
56
- def applicable_equality_delete_files(self, applicable_equality_delete_files):
68
+ def applicable_equality_delete_files(
69
+ self,
70
+ applicable_equality_delete_files: Optional[DataFileListGroup],
71
+ ) -> None:
57
72
  self["applicable_equality_delete_files"] = applicable_equality_delete_files
58
73
 
59
74
  @existing_position_delete_files.setter
60
- def existing_position_delete_files(self, existing_position_delete_files):
75
+ def existing_position_delete_files(
76
+ self, existing_position_delete_files: Optional[DataFileList]
77
+ ) -> None:
61
78
  self["existing_position_delete_files"] = existing_position_delete_files
@@ -1,18 +1,22 @@
1
1
  from __future__ import annotations
2
- from typing import Dict
2
+ from typing import Dict, List, Any
3
+ from pyiceberg.manifest import DataFile
3
4
 
4
5
 
5
6
  class ConvertResult(Dict):
6
7
  @staticmethod
7
8
  def of(
8
- convert_task_index,
9
- to_be_added_files,
10
- to_be_deleted_files,
11
- position_delete_record_count,
12
- input_data_files_record_count,
13
- input_data_files_hash_columns_in_memory_sizes,
14
- position_delete_in_memory_sizes,
15
- position_delete_on_disk_sizes,
9
+ convert_task_index: int,
10
+ to_be_added_files: List[DataFile],
11
+ to_be_deleted_files: Dict[Any, List[DataFile]],
12
+ position_delete_record_count: int,
13
+ input_data_files_record_count: int,
14
+ input_data_files_hash_columns_in_memory_sizes: int,
15
+ position_delete_in_memory_sizes: int,
16
+ position_delete_on_disk_sizes: int,
17
+ input_data_files_on_disk_size: int,
18
+ peak_memory_usage_bytes: int,
19
+ memory_usage_percentage: float,
16
20
  ) -> ConvertResult:
17
21
 
18
22
  result = ConvertResult()
@@ -26,6 +30,9 @@ class ConvertResult(Dict):
26
30
  ] = input_data_files_hash_columns_in_memory_sizes
27
31
  result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
28
32
  result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
33
+ result["input_data_files_on_disk_size"] = input_data_files_on_disk_size
34
+ result["peak_memory_usage_bytes"] = peak_memory_usage_bytes
35
+ result["memory_usage_percentage"] = memory_usage_percentage
29
36
  return result
30
37
 
31
38
  @property
@@ -33,29 +40,41 @@ class ConvertResult(Dict):
33
40
  return self["convert_task_index"]
34
41
 
35
42
  @property
36
- def to_be_added_files(self):
43
+ def to_be_added_files(self) -> List[DataFile]:
37
44
  return self["to_be_added_files"]
38
45
 
39
46
  @property
40
- def to_be_deleted_files(self):
47
+ def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
41
48
  return self["to_be_deleted_files"]
42
49
 
43
50
  @property
44
- def position_delete_record_count(self):
51
+ def position_delete_record_count(self) -> int:
45
52
  return self["position_delete_record_count"]
46
53
 
47
54
  @property
48
- def input_data_files_record_count(self):
55
+ def input_data_files_record_count(self) -> int:
49
56
  return self["input_data_files_record_count"]
50
57
 
51
58
  @property
52
- def input_data_files_hash_columns_in_memory_sizes(self):
59
+ def input_data_files_hash_columns_in_memory_sizes(self) -> int:
53
60
  return self["input_data_files_hash_columns_in_memory_sizes"]
54
61
 
55
62
  @property
56
- def position_delete_in_memory_sizes(self):
63
+ def position_delete_in_memory_sizes(self) -> int:
57
64
  return self["position_delete_in_memory_sizes"]
58
65
 
59
66
  @property
60
- def position_delete_on_disk_sizes(self):
67
+ def position_delete_on_disk_sizes(self) -> int:
61
68
  return self["position_delete_on_disk_sizes"]
69
+
70
+ @property
71
+ def input_data_files_on_disk_size(self) -> int:
72
+ return self["input_data_files_on_disk_size"]
73
+
74
+ @property
75
+ def peak_memory_usage_bytes(self) -> int:
76
+ return self["peak_memory_usage_bytes"]
77
+
78
+ @property
79
+ def memory_usage_percentage(self) -> float:
80
+ return self["memory_usage_percentage"]
@@ -1,10 +1,11 @@
1
1
  from __future__ import annotations
2
- from typing import Optional, Dict
2
+ from typing import Optional, Dict, Any, List
3
3
  from deltacat.compute.converter.constants import (
4
4
  DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
5
5
  )
6
6
  from deltacat.constants import DEFAULT_NAMESPACE
7
7
  from fsspec import AbstractFileSystem
8
+ from pyiceberg.catalog import Catalog
8
9
 
9
10
 
10
11
  class ConverterSessionParams(dict):
@@ -13,7 +14,7 @@ class ConverterSessionParams(dict):
13
14
  """
14
15
 
15
16
  @staticmethod
16
- def of(params: Optional[Dict]) -> ConverterSessionParams:
17
+ def of(params: Optional[Dict[str, Any]]) -> ConverterSessionParams:
17
18
  params = {} if params is None else params
18
19
  assert params.get("catalog") is not None, "catalog is a required arg"
19
20
  assert (
@@ -41,13 +42,13 @@ class ConverterSessionParams(dict):
41
42
  )
42
43
  result.merge_keys = params.get("merge_keys", None)
43
44
  result.s3_client_kwargs = params.get("s3_client_kwargs", {})
44
- result.s3_file_system = params.get("s3_file_system", None)
45
+ result.filesystem = params.get("filesystem", None)
45
46
  result.s3_prefix_override = params.get("s3_prefix_override", None)
46
47
 
47
48
  return result
48
49
 
49
50
  @property
50
- def catalog(self):
51
+ def catalog(self) -> Catalog:
51
52
  return self["catalog"]
52
53
 
53
54
  @property
@@ -63,7 +64,7 @@ class ConverterSessionParams(dict):
63
64
  return self["iceberg_namespace"]
64
65
 
65
66
  @iceberg_namespace.setter
66
- def iceberg_namespace(self, iceberg_namespace) -> None:
67
+ def iceberg_namespace(self, iceberg_namespace: str) -> None:
67
68
  self["iceberg_namespace"] = iceberg_namespace
68
69
 
69
70
  @property
@@ -71,7 +72,9 @@ class ConverterSessionParams(dict):
71
72
  return self["enforce_primary_key_uniqueness"]
72
73
 
73
74
  @enforce_primary_key_uniqueness.setter
74
- def enforce_primary_key_uniqueness(self, enforce_primary_key_uniqueness) -> None:
75
+ def enforce_primary_key_uniqueness(
76
+ self, enforce_primary_key_uniqueness: bool
77
+ ) -> None:
75
78
  self["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
76
79
 
77
80
  @property
@@ -80,7 +83,7 @@ class ConverterSessionParams(dict):
80
83
 
81
84
  @compact_previous_position_delete_files.setter
82
85
  def compact_previous_position_delete_files(
83
- self, compact_previous_position_delete_files
86
+ self, compact_previous_position_delete_files: bool
84
87
  ) -> None:
85
88
  self[
86
89
  "compact_previous_position_delete_files"
@@ -92,50 +95,50 @@ class ConverterSessionParams(dict):
92
95
 
93
96
  @position_delete_for_multiple_data_files.setter
94
97
  def position_delete_for_multiple_data_files(
95
- self, position_delete_for_multiple_data_files
98
+ self, position_delete_for_multiple_data_files: bool
96
99
  ) -> None:
97
100
  self[
98
101
  "position_delete_for_multiple_data_files"
99
102
  ] = position_delete_for_multiple_data_files
100
103
 
101
104
  @property
102
- def task_max_parallelism(self) -> str:
105
+ def task_max_parallelism(self) -> int:
103
106
  return self["task_max_parallelism"]
104
107
 
105
108
  @task_max_parallelism.setter
106
- def task_max_parallelism(self, task_max_parallelism) -> None:
109
+ def task_max_parallelism(self, task_max_parallelism: int) -> None:
107
110
  self["task_max_parallelism"] = task_max_parallelism
108
111
 
109
112
  @property
110
- def merge_keys(self) -> str:
113
+ def merge_keys(self) -> Optional[List[str]]:
111
114
  return self["merge_keys"]
112
115
 
113
116
  @merge_keys.setter
114
- def merge_keys(self, merge_keys) -> None:
117
+ def merge_keys(self, merge_keys: Optional[List[str]]) -> None:
115
118
  self["merge_keys"] = merge_keys
116
119
 
117
120
  @property
118
- def s3_client_kwargs(self) -> Dict:
121
+ def s3_client_kwargs(self) -> Dict[str, Any]:
119
122
  return self["s3_client_kwargs"]
120
123
 
121
124
  @s3_client_kwargs.setter
122
- def s3_client_kwargs(self, s3_client_kwargs) -> None:
125
+ def s3_client_kwargs(self, s3_client_kwargs: Dict[str, Any]) -> None:
123
126
  self["s3_client_kwargs"] = s3_client_kwargs
124
127
 
125
128
  @property
126
- def s3_file_system(self) -> AbstractFileSystem:
127
- return self["s3_file_system"]
129
+ def filesystem(self) -> Optional[AbstractFileSystem]:
130
+ return self["filesystem"]
128
131
 
129
- @s3_file_system.setter
130
- def s3_file_system(self, s3_file_system) -> None:
131
- self["s3_file_system"] = s3_file_system
132
+ @filesystem.setter
133
+ def filesystem(self, filesystem: Optional[AbstractFileSystem]) -> None:
134
+ self["filesystem"] = filesystem
132
135
 
133
136
  @property
134
- def location_provider_prefix_override(self) -> str:
137
+ def location_provider_prefix_override(self) -> Optional[str]:
135
138
  return self["location_provider_prefix_override"]
136
139
 
137
140
  @location_provider_prefix_override.setter
138
141
  def location_provider_prefix_override(
139
- self, location_provider_prefix_override
142
+ self, location_provider_prefix_override: Optional[str]
140
143
  ) -> None:
141
144
  self["location_provider_prefix_override"] = location_provider_prefix_override
@@ -1,8 +1,15 @@
1
- from typing import Optional
2
-
3
-
4
- def load_catalog(iceberg_catalog_name, iceberg_catalog_properties):
5
- catalog = load_catalog(
1
+ from typing import Optional, Dict, Any
2
+ from pyiceberg.table import Table
3
+ from pyiceberg.catalog import Catalog, load_catalog as pyiceberg_load_catalog
4
+ from botocore.credentials import Credentials
5
+ import boto3
6
+ from boto3.session import Session
7
+
8
+
9
+ def load_catalog(
10
+ iceberg_catalog_name: str, iceberg_catalog_properties: Dict[str, Any]
11
+ ) -> Catalog:
12
+ catalog = pyiceberg_load_catalog(
6
13
  name=iceberg_catalog_name,
7
14
  **iceberg_catalog_properties,
8
15
  )
@@ -23,25 +30,21 @@ def get_s3_path(
23
30
  return result_path
24
31
 
25
32
 
26
- def get_bucket_name():
27
- return "metadata-py4j-zyiqin1"
33
+ def get_bucket_name() -> str:
34
+ return "test-bucket"
28
35
 
29
36
 
30
- def get_s3_prefix():
37
+ def get_s3_prefix() -> str:
31
38
  return get_s3_path(get_bucket_name())
32
39
 
33
40
 
34
- def get_credential():
35
- import boto3
36
-
37
- boto3_session = boto3.Session()
38
- credentials = boto3_session.get_credentials()
41
+ def get_credential() -> Credentials:
42
+ boto3_session: Session = boto3.Session()
43
+ credentials: Credentials = boto3_session.get_credentials()
39
44
  return credentials
40
45
 
41
46
 
42
- def get_glue_catalog():
43
- from pyiceberg.catalog import load_catalog
44
-
47
+ def get_glue_catalog() -> Catalog:
45
48
  credential = get_credential()
46
49
  # Credentials are refreshable, so accessing your access key / secret key
47
50
  # separately can lead to a race condition. Use this to get an actual matched
@@ -51,7 +54,7 @@ def get_glue_catalog():
51
54
  secret_access_key = credential.secret_key
52
55
  session_token = credential.token
53
56
  s3_path = get_s3_prefix()
54
- glue_catalog = load_catalog(
57
+ glue_catalog = pyiceberg_load_catalog(
55
58
  "glue",
56
59
  **{
57
60
  "warehouse": s3_path,
@@ -70,6 +73,6 @@ def get_glue_catalog():
70
73
  return glue_catalog
71
74
 
72
75
 
73
- def load_table(catalog, table_name):
76
+ def load_table(catalog: Catalog, table_name: str) -> Table:
74
77
  loaded_table = catalog.load_table(table_name)
75
78
  return loaded_table
@@ -11,7 +11,7 @@ from pyiceberg.io.pyarrow import (
11
11
  MetricsMode,
12
12
  StatsAggregator,
13
13
  )
14
- from typing import Dict, List, Set
14
+ from typing import Dict, List, Set, Any, Tuple
15
15
  from deltacat.compute.converter.utils.iceberg_columns import (
16
16
  ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN,
17
17
  ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN,
@@ -24,18 +24,23 @@ from pyiceberg.manifest import (
24
24
  DataFileContent,
25
25
  FileFormat,
26
26
  )
27
- from pyiceberg.table import _min_sequence_number, _open_manifest
27
+ from pyiceberg.table import _min_sequence_number, _open_manifest, Table
28
28
  from pyiceberg.utils.concurrent import ExecutorFactory
29
29
  from itertools import chain
30
30
  from pyiceberg.typedef import (
31
31
  KeyDefaultDict,
32
32
  )
33
+ from pyiceberg.schema import Schema
34
+ from pyiceberg.io import FileIO
35
+ from deltacat.compute.converter.model.convert_input_files import (
36
+ DataFileList,
37
+ )
33
38
 
34
39
 
35
40
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
36
41
 
37
42
 
38
- def parquet_path_to_id_mapping_override(schema):
43
+ def parquet_path_to_id_mapping_override(schema: Schema) -> Dict[str, int]:
39
44
  res = parquet_path_to_id_mapping(schema)
40
45
  # Override here to insert position delete reserved column field IDs
41
46
  res["file_path"] = ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN
@@ -155,13 +160,16 @@ def data_file_statistics_from_parquet_metadata(
155
160
  )
156
161
 
157
162
 
158
- def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict):
159
- data_file_content_type = DataFileContent.POSITION_DELETES
163
+ def parquet_files_dict_to_iceberg_data_files(
164
+ io: FileIO,
165
+ table_metadata: Any,
166
+ files_dict: Dict[Any, List[str]],
167
+ file_content_type: DataFileContent,
168
+ ) -> List[DataFile]:
160
169
  iceberg_files = []
161
170
  schema = table_metadata.schema()
162
171
  for partition_value, file_paths in files_dict.items():
163
172
  for file_path in file_paths:
164
- logger.info(f"DEBUG_file_path:{file_path}")
165
173
  input_file = io.new_input(file_path)
166
174
  with input_file.open() as input_stream:
167
175
  parquet_metadata = pq.read_metadata(input_stream)
@@ -177,7 +185,7 @@ def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict):
177
185
  )
178
186
 
179
187
  data_file = DataFile(
180
- content=data_file_content_type,
188
+ content=file_content_type,
181
189
  file_path=file_path,
182
190
  file_format=FileFormat.PARQUET,
183
191
  partition=partition_value,
@@ -192,10 +200,11 @@ def parquet_files_dict_to_iceberg_data_files(io, table_metadata, files_dict):
192
200
  return iceberg_files
193
201
 
194
202
 
195
- def fetch_all_bucket_files(table):
203
+ def fetch_all_bucket_files(
204
+ table: Table,
205
+ ) -> Tuple[Dict[Any, DataFileList], Dict[Any, DataFileList], Dict[Any, DataFileList]]:
196
206
  # step 1: filter manifests using partition summaries
197
207
  # the filter depends on the partition spec used to write the manifest file, so create a cache of filters for each spec id
198
-
199
208
  data_scan = table.scan()
200
209
  snapshot = data_scan.snapshot()
201
210
  if not snapshot: