deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/utils/converter_session_utils.py

@@ -1,97 +1,102 @@
 from collections import defaultdict
 import logging
 from deltacat import logs
-from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+from deltacat.compute.converter.model.convert_input_files import (
+    ConvertInputFiles,
+    DataFileList,
+    DataFileListGroup,
+)
+from typing import List, Dict, Tuple, Any
+from enum import Enum
+from pyiceberg.manifest import DataFile
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def check_data_files_sequence_number(data_files_list, equality_delete_files_list):
+def check_data_files_sequence_number(
+    data_files_list: DataFileList,
+    equality_delete_files_list: DataFileList,
+) -> Tuple[DataFileListGroup, DataFileListGroup]:
     # Sort by file sequence number
     data_files_list.sort(key=lambda file_tuple: file_tuple[0])
     equality_delete_files_list.sort(key=lambda file_tuple: file_tuple[0])
-
-    equality_delete_files = []
-    result_data_file = []
-
-    # Pointer for list data_file
-    data_file_pointer = 0
+    data_file_delete_applicable = []
+    result_eq_files_list = []
 
     # Loop through each value in equality_delete_file
-    for equality_file_tuple in equality_delete_files_list:
-        # Find all values in data_file that are smaller than val_equality
-        valid_values = []
+    for data_file_tuple in data_files_list:
+
+        # Find all values in equality delete file that having a larger sequence number than current data file
+        valid_values_eq = []
 
+        # Pointer for equality delete file
+        eq_file_pointer = 0
         # Move data_file_pointer to the first value in data_file that is smaller than val_equality
         while (
-            data_file_pointer < len(data_files_list)
-            and data_files_list[data_file_pointer][0] < equality_file_tuple[0]
+            eq_file_pointer < len(equality_delete_files_list)
+            and equality_delete_files_list[eq_file_pointer][0] > data_file_tuple[0]
         ):
-            valid_values.append(data_files_list[data_file_pointer])
-            data_file_pointer += 1
-        equality_delete_files.append(equality_file_tuple)
-
-        # Append the value from equality_delete_file and the corresponding valid values from data_file
-        if valid_values:
-            result_data_file.append(valid_values)
-
-    result_equality_delete_file = append_larger_sequence_number_data_files(
-        equality_delete_files
+            valid_values_eq.append(equality_delete_files_list[eq_file_pointer])
+            eq_file_pointer += 1
+
+        if valid_values_eq:
+            # Append the value for both applicable eq files list and applicable data files list
+            data_file_delete_applicable.append(data_file_tuple)
+            result_eq_files_list.append(valid_values_eq)
+
+    res_data_file_list = []
+    res_equality_delete_file_list = []
+    merged_file_dict = defaultdict(list)
+    for data_file_sublist, eq_delete_sublist in zip(
+        data_file_delete_applicable, result_eq_files_list
+    ):
+        merged_file_dict[tuple(eq_delete_sublist)].append(data_file_sublist)
+    for eq_file_list, data_file_list in merged_file_dict.items():
+        res_data_file_list.append(list(set(data_file_list)))
+        res_equality_delete_file_list.append(list(set(eq_file_list)))
+
+    assert len(res_data_file_list) == len(res_equality_delete_file_list), (
+        f"length of applicable data files list: {len(res_data_file_list)} "
+        f"should equal to length of equality delete files list:{len(res_equality_delete_file_list)}"
     )
 
-    return result_equality_delete_file, result_data_file
-
-
-def append_larger_sequence_number_data_files(data_files_list):
-    result = []
-    # Iterate over the input list
-    for i in range(len(data_files_list)):
-        sublist = data_files_list[i:]
-        sublist_file_list = []
-        for file in sublist:
-            sublist_file_list.append(file)
-        result.append(sublist_file_list)
-    return result
+    return res_equality_delete_file_list, res_data_file_list
 
 
 def construct_iceberg_table_prefix(
-    iceberg_warehouse_bucket_name, table_name, iceberg_namespace
-):
+    iceberg_warehouse_bucket_name: str, table_name: str, iceberg_namespace: str
+) -> str:
     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
 
 
-def partition_value_record_to_partition_value_string(partition):
+def partition_value_record_to_partition_value_string(partition: Any) -> str:
     # Get string representation of partition value out of Record[partition_value]
     partition_value_str = partition.__repr__().split("[", 1)[1].split("]")[0]
     return partition_value_str
 
 
 def group_all_files_to_each_bucket(
-    data_file_dict, equality_delete_dict, pos_delete_dict
-):
+    data_file_dict: Dict[Any, DataFileList],
+    equality_delete_dict: Dict[Any, DataFileList],
+    pos_delete_dict: Dict[Any, DataFileList],
+) -> List[ConvertInputFiles]:
     convert_input_files_for_all_buckets = []
     files_for_each_bucket_for_deletes = defaultdict(tuple)
     if equality_delete_dict:
         for partition_value, equality_delete_file_list in equality_delete_dict.items():
-            (
-                result_equality_delete_file,
-                result_data_file,
-            ) = check_data_files_sequence_number(
-                data_files_list=data_file_dict[partition_value],
-                equality_delete_files_list=equality_delete_dict[partition_value],
-            )
-            files_for_each_bucket_for_deletes[partition_value] = (
-                result_data_file,
-                result_equality_delete_file,
-                [],
-            )
-            if partition_value not in data_file_dict:
-                convert_input_file = ConvertInputFiles.of(
-                    partition_value=partition_value,
-                    applicable_data_files=result_data_file,
-                    applicable_equalitu_delete_files=result_equality_delete_file,
+            if partition_value in data_file_dict:
+                (
+                    result_equality_delete_file,
+                    result_data_file,
+                ) = check_data_files_sequence_number(
+                    data_files_list=data_file_dict[partition_value],
+                    equality_delete_files_list=equality_delete_dict[partition_value],
+                )
+                files_for_each_bucket_for_deletes[partition_value] = (
+                    result_data_file,
+                    result_equality_delete_file,
+                    [],
                 )
-                convert_input_files_for_all_buckets.append(convert_input_file)
 
     for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
         convert_input_file = ConvertInputFiles.of(
@@ -102,8 +107,69 @@ def group_all_files_to_each_bucket(
         convert_input_file.applicable_data_files = (
             files_for_each_bucket_for_deletes[partition_value][0]
         )
-        convert_input_file.applicable_delete_files = (
+        convert_input_file.applicable_equality_delete_files = (
             files_for_each_bucket_for_deletes[partition_value][1]
         )
         convert_input_files_for_all_buckets.append(convert_input_file)
     return convert_input_files_for_all_buckets
+
+
+def sort_data_files_maintaining_order(data_files: DataFileList) -> DataFileList:
+    """
+    Sort data files deterministically based on two criterias:
+    1. Sequence number: Newly added files will have a higher sequence number
+    2. File path: If file sequence is the same, files are guaranteed to be returned in a deterministic order since file path is unique.
+    """
+    if data_files:
+        data_files = sorted(data_files, key=lambda f: (f[0], f[1].file_path))
+    return data_files
+
+
+class SnapshotType(Enum):
+    """Enumeration of possible snapshot types."""
+
+    NONE = "none"
+    APPEND = "append"
+    REPLACE = "replace"
+    DELETE = "delete"
+
+
+def _get_snapshot_action_description(
+    snapshot_type: SnapshotType,
+    files_to_delete: List[List[DataFile]],
+    files_to_add: List[DataFile],
+) -> str:
+    """Get a human-readable description of the snapshot action."""
+    descriptions = {
+        SnapshotType.NONE: "No changes needed",
+        SnapshotType.APPEND: f"Adding {len(files_to_add)} new files",
+        SnapshotType.REPLACE: f"Replacing {sum(len(files) for files in files_to_delete)} files with {len(files_to_add)} new files",
+        SnapshotType.DELETE: f"Deleting {sum(len(files) for files in files_to_delete)} files",
+    }
+    return descriptions[snapshot_type]
+
+
+def _determine_snapshot_type(
+    to_be_deleted_files: List[List[DataFile]], to_be_added_files: List[DataFile]
+) -> SnapshotType:
+    """
+    Determine the snapshot type based on file changes.
+
+    Args:
+        to_be_deleted_files: List of files to be deleted
+        to_be_added_files: List of files to be added
+
+    Returns:
+        SnapshotType indicating what kind of snapshot to commit
+    """
+    has_files_to_delete = bool(to_be_deleted_files)
+    has_files_to_add = bool(to_be_added_files)
+
+    if not has_files_to_delete and not has_files_to_add:
+        return SnapshotType.NONE
+    elif not has_files_to_delete and has_files_to_add:
+        return SnapshotType.APPEND
+    elif has_files_to_delete and has_files_to_add:
+        return SnapshotType.REPLACE
+    else:  # has_files_to_delete and not has_files_to_add
+        return SnapshotType.DELETE
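
Note: a minimal standalone trace of the matching loop in the rewritten check_data_files_sequence_number above, using (sequence_number, file_name) tuples as stand-ins for the real (sequence number, DataFile) pairs; the sample values are hypothetical. The loop collects, for each data file, the leading run of equality delete files whose sequence number is strictly greater:

    # Stand-in inputs: (sequence_number, file_name) tuples, hypothetical values.
    data_files = [(1, "d1.parquet"), (3, "d2.parquet")]
    eq_deletes = [(2, "eq1.parquet"), (4, "eq2.parquet")]

    # Same steps as the diff: sort both lists by sequence number, then scan
    # the equality delete list from the front for each data file.
    data_files.sort(key=lambda t: t[0])
    eq_deletes.sort(key=lambda t: t[0])
    for data_file_tuple in data_files:
        valid_values_eq, eq_file_pointer = [], 0
        while (
            eq_file_pointer < len(eq_deletes)
            and eq_deletes[eq_file_pointer][0] > data_file_tuple[0]
        ):
            valid_values_eq.append(eq_deletes[eq_file_pointer])
            eq_file_pointer += 1
        print(data_file_tuple[1], "->", [name for _, name in valid_values_eq])
    # d1.parquet -> ['eq1.parquet', 'eq2.parquet']
    # d2.parquet -> []  (the front-of-list scan stops at eq1's lower sequence number)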
deltacat/compute/converter/utils/iceberg_columns.py

@@ -1,5 +1,5 @@
 import pyarrow as pa
-from typing import Union
+from typing import Union, Iterator, Any
 import numpy as np
 
 # Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
@@ -9,7 +9,7 @@ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
 ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
 
 
-def _get_iceberg_col_name(suffix):
+def _get_iceberg_col_name(suffix: str) -> str:
     return suffix
 
 
@@ -26,15 +26,16 @@ _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
 )
 
 
-def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+def get_record_index_column_array(obj: Any) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_RECORD_IDX_COLUMN_TYPE,
     )
 
 
-def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
-
+def append_record_idx_col(
+    table: pa.Table, ordered_record_indices: Iterator[int]
+) -> pa.Table:
     table = table.append_column(
         _ORDERED_RECORD_IDX_COLUMN_FIELD,
         get_record_index_column_array(ordered_record_indices),
@@ -55,7 +56,7 @@ _FILE_PATH_COLUMN_FIELD = pa.field(
 )
 
 
-def append_file_path_column(table: pa.Table, file_path: str):
+def append_file_path_column(table: pa.Table, file_path: str) -> pa.Table:
     table = table.append_column(
         _FILE_PATH_COLUMN_FIELD,
         pa.array(np.repeat(file_path, len(table)), _FILE_PATH_COLUMN_TYPE),
@@ -72,11 +73,15 @@ _GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
 
 
 def append_global_record_idx_column(
-    table: pa.Table, ordered_record_indices
+    table: pa.Table, ordered_record_indices: Iterator[int]
 ) -> pa.Table:
-
     table = table.append_column(
         _GLOBAL_RECORD_IDX_COLUMN_NAME,
         pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
     )
     return table
+
+
+_IDENTIFIER_COLUMNS_HASH_COLUMN_NAME = _get_iceberg_col_name(
+    "identifier_columns_hashed"
+)
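
Note: the append_file_path_column change above broadcasts a single file path across every row of a pyarrow Table. A short self-contained sketch of that pattern; the column name and values here are illustrative, not the module's reserved Iceberg field:

    import numpy as np
    import pyarrow as pa

    table = pa.table({"id": [1, 2, 3]})
    file_path = "s3://bucket/data/part-0001.parquet"  # hypothetical path
    # Repeat the scalar path len(table) times and append it as a new column.
    table = table.append_column(
        pa.field("file_path", pa.string()),
        pa.array(np.repeat(file_path, len(table)), pa.string()),
    )
    print(table.column("file_path").to_pylist())  # the same path, three times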
deltacat/compute/converter/utils/io.py

@@ -1,26 +1,57 @@
+import logging
+
+from fsspec import AbstractFileSystem
+from deltacat import logs
 import deltacat.compute.converter.utils.iceberg_columns as sc
 import daft
+from deltacat.utils.daft import _get_s3_io_config
+from daft import TimeUnit, DataFrame
+import pyarrow as pa
+from typing import Callable, Optional, List, Dict, Any
+from deltacat.utils.pyarrow import sliced_string_cast
+from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from pyiceberg.manifest import DataFile
+import pyarrow.compute as pc
+from deltacat.types.media import ContentType
+from deltacat.types.tables import (
+    get_table_writer,
+    get_table_slicer,
+    write_sliced_table as types_write_sliced_table,
+)
+from deltacat.storage import LocalTable, DistributedDataset
+from typing import Union
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def download_data_table_and_append_iceberg_columns(
-    file, columns_to_download, additional_columns_to_append, sequence_number
-):
-    # TODO; add S3 client kwargs
+    file: DataFile,
+    columns_to_download: List[str],
+    additional_columns_to_append: Optional[List[str]] = [],
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
+) -> pa.Table:
     table = download_parquet_with_daft_hash_applied(
-        identify_columns=columns_to_download, file=file, s3_client_kwargs={}
+        identifier_columns=columns_to_download,
+        file=file,
+        s3_client_kwargs=s3_client_kwargs,
     )
+
     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
         table = sc.append_file_path_column(table, file.file_path)
     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
         record_idx_iterator = iter(range(len(table)))
         table = sc.append_record_idx_col(table, record_idx_iterator)
+
     return table
 
 
 def download_parquet_with_daft_hash_applied(
-    identify_columns, file, s3_client_kwargs, **kwargs
-):
-    from daft import TimeUnit
+    identifier_columns: List[str],
+    file: DataFile,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+    **kwargs: Any,
+) -> pa.Table:
 
     # TODO: Add correct read kwargs as in:
     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
@@ -29,15 +60,144 @@ def download_parquet_with_daft_hash_applied(
         kwargs.get("coerce_int96_timestamp_unit", "ms")
     )
 
-    from deltacat.utils.daft import _get_s3_io_config
-
     # TODO: Use Daft SHA1 hash instead to minimize probably of data corruption
     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
-    df = daft.read_parquet(
+    df = daft_read_parquet(
         path=file.file_path,
         io_config=io_config,
        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
     )
-    df = df.select(daft.col(identify_columns[0]).hash())
-    arrow_table = df.to_arrow()
-    return arrow_table
+
+    hash_column = concatenate_hashed_identifier_columns(
+        df=df, identifier_columns=identifier_columns
+    )
+
+    table = pa.Table.from_arrays(
+        [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+    )
+
+    return table
+
+
+def daft_read_parquet(
+    path: str, io_config: Dict[str, Any], coerce_int96_timestamp_unit: TimeUnit
+) -> DataFrame:
+    df = daft.read_parquet(
+        path=path,
+        io_config=io_config,
+        coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+    )
+    return df
+
+
+def concatenate_hashed_identifier_columns(
+    df: DataFrame, identifier_columns: List[str]
+) -> pa.Array:
+    pk_hash_columns = []
+    previous_hash_column_length = None
+    for i in range(len(identifier_columns)):
+        pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+        pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+        # Assert that each hash column downloaded are same length to ensure we don't create mismatch between columns.
+        if not previous_hash_column_length:
+            previous_hash_column_length = len(pk_hash_column_arrow)
+        else:
+            assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                f"Identifier column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                f"but expected {previous_hash_column_length}."
+            )
+            previous_hash_column_length = len(pk_hash_column_arrow)
+
+        # Convert identifier from different datatypes to string here
+        pk_hash_column_str = sliced_string_cast(
+            pk_hash_column_arrow[identifier_columns[i]]
+        )
+        assert len(pk_hash_column_str) == previous_hash_column_length, (
+            f"Casting column Length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+            f"before casting length: {previous_hash_column_length}."
+        )
+
+        pk_hash_columns.append(pk_hash_column_str)
+
+    pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+    pk_hash_columns_concatenated = pc.binary_join_element_wise(
+        *pk_hash_columns, null_handling="replace"
+    )
+    assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+        f"Concatenated column Length mismatch: Final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+        f"before concatenating length: {previous_hash_column_length}."
+    )
+
+    return pk_hash_columns_concatenated
+
+
+def write_sliced_table(
+    table: Union[LocalTable, DistributedDataset],
+    base_path: str,
+    table_writer_kwargs: Optional[Dict[str, Any]],
+    content_type: ContentType = ContentType.PARQUET,
+    max_records_per_file: Optional[int] = 4000000,
+    filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]] = None,
+    **kwargs,
+) -> List[str]:
+    """
+    Writes the given table to 1 or more files and return the paths
+    of the files written.
+    """
+    if isinstance(filesystem, pa.fs.FileSystem):
+        table_writer_fn = get_table_writer(table)
+        table_slicer_fn = get_table_slicer(table)
+
+        # Create a wrapper for the table writer that ensures directory creation
+        def table_writer_with_dir_creation(
+            dataframe: Any,
+            base_path: str,
+            filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]],
+            block_path_provider: Callable,
+            content_type: str = ContentType.PARQUET.value,
+            **kwargs,
+        ):
+            try:
+                # Ensure base path directory exists
+                if isinstance(base_path, str):
+                    # Normalize the base path and ensure it's treated as a directory path
+                    base_dir = base_path.rstrip("/")
+                    filesystem.create_dir(base_dir, recursive=True)
+            except Exception:
+                # Directory might already exist or there might be permission issues
+                # Let the original write attempt proceed
+                pass
+            return table_writer_fn(
+                dataframe,
+                base_path,
+                filesystem,
+                block_path_provider,
+                content_type,
+                **kwargs,
+            )
+
+        # TODO(pdames): Disable redundant file info fetch currently
+        # used to construct unused manifest entry metadata.
+        manifest_entry_list = types_write_sliced_table(
+            table=table,
+            base_path=base_path,
+            filesystem=filesystem,
+            max_records_per_entry=max_records_per_file,
+            table_writer_fn=table_writer_with_dir_creation,
+            table_slicer_fn=table_slicer_fn,
+            table_writer_kwargs=table_writer_kwargs,
+            content_type=content_type,
+        )
+        paths = [entry.uri for entry in manifest_entry_list]
+        return paths
+    else:
+        return upload_table_with_retry(
+            table=table,
+            s3_url_prefix=base_path,
+            s3_table_writer_kwargs=table_writer_kwargs,
+            content_type=content_type,
+            max_records_per_file=max_records_per_file,
+            s3_file_system=filesystem,
+            **kwargs,
+        )
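
Note: the new concatenate_hashed_identifier_columns above leans on pyarrow's binary_join_element_wise, where the final positional argument is the per-row separator. A minimal sketch with stand-in hash strings (the real code joins Daft-hashed identifier columns using IDENTIFIER_FIELD_DELIMITER):

    import pyarrow.compute as pc

    hashes_a = ["h1", "h2", None]  # stand-in per-row hashes for one identifier column
    hashes_b = ["x1", "x2", "x3"]  # stand-in per-row hashes for another
    # The trailing argument is the separator; null_handling="replace" substitutes
    # an empty string for null inputs instead of emitting a null result row.
    joined = pc.binary_join_element_wise(hashes_a, hashes_b, "|", null_handling="replace")
    print(joined.to_pylist())  # ['h1|x1', 'h2|x2', '|x3']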
deltacat/compute/converter/utils/s3u.py

@@ -4,14 +4,16 @@ from tenacity import (
     stop_after_delay,
     wait_random_exponential,
 )
-from typing import Union
-from deltacat.aws.s3u import CapturedBlockWritePaths, UuidBlockWritePathProvider
+from typing import Union, Optional, Dict, Any, List, Callable
+from deltacat.types.tables import (
+    CapturedBlockWritePaths,
+    UuidBlockWritePathProvider,
+)
 from deltacat.types.tables import (
     get_table_writer,
     get_table_length,
     TABLE_CLASS_TO_SLICER_FUNC,
 )
-from typing import Optional, Dict, Any, List
 from deltacat.exceptions import RetryableError
 from deltacat.storage import (
     DistributedDataset,
@@ -21,19 +23,22 @@ from deltacat.types.media import (
     ContentEncoding,
     ContentType,
 )
-from deltacat.aws.s3u import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
+from deltacat.constants import UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY
 import s3fs
+import boto3
+from boto3.session import Session
+from botocore.credentials import Credentials
 
 
-def get_credential():
-    import boto3
-
-    boto3_session = boto3.Session()
-    credentials = boto3_session.get_credentials()
+def get_credential() -> Credentials:
+    boto3_session: Session = boto3.Session()
+    credentials: Credentials = boto3_session.get_credentials()
     return credentials
 
 
 def get_s3_file_system(content_type):
+    import s3fs  # noqa: F401
+
     token_holder = get_credential()
     content_encoding = ContentEncoding.IDENTITY
 
@@ -57,12 +62,12 @@ def upload_table_with_retry(
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
     content_type: ContentType = ContentType.PARQUET,
     max_records_per_file: Optional[int] = 4000000,
-    s3_file_system=None,
-    **s3_client_kwargs,
+    filesystem: Optional[s3fs.S3FileSystem] = None,
+    **s3_client_kwargs: Any,
 ) -> List[str]:
     """
-    Writes the given table to 1 or more S3 files and return Redshift
-    manifest entries describing the uploaded files.
+    Writes the given table to 1 or more S3 files and return the paths
+    of the S3 files written.
     """
     retrying = Retrying(
         wait=wait_random_exponential(multiplier=1, max=60),
@@ -73,11 +78,11 @@ def upload_table_with_retry(
     if s3_table_writer_kwargs is None:
         s3_table_writer_kwargs = {}
 
-    if not s3_file_system:
-        s3_file_system = get_s3_file_system(content_type=content_type)
+    if not filesystem:
+        filesystem = get_s3_file_system(content_type=content_type)
     capture_object = CapturedBlockWritePaths()
     block_write_path_provider = UuidBlockWritePathProvider(
-        capture_object=capture_object
+        capture_object=capture_object, base_path=s3_url_prefix
     )
     s3_table_writer_func = get_table_writer(table)
     table_record_count = get_table_length(table)
@@ -86,7 +91,7 @@ def upload_table_with_retry(
            fn=upload_table,
            table_slices=table,
            s3_base_url=f"{s3_url_prefix}",
-            s3_file_system=s3_file_system,
+            s3_file_system=filesystem,
            s3_table_writer_func=s3_table_writer_func,
            s3_table_writer_kwargs=s3_table_writer_kwargs,
            block_write_path_provider=block_write_path_provider,
@@ -101,7 +106,7 @@ def upload_table_with_retry(
                fn=upload_table,
                table_slices=table_slice,
                s3_base_url=f"{s3_url_prefix}",
-                s3_file_system=s3_file_system,
+                s3_file_system=filesystem,
                s3_table_writer_func=s3_table_writer_func,
                s3_table_writer_kwargs=s3_table_writer_kwargs,
                block_write_path_provider=block_write_path_provider,
@@ -110,18 +115,28 @@ def upload_table_with_retry(
     )
     del block_write_path_provider
     write_paths = capture_object.write_paths()
-    return write_paths
+    s3_write_paths = []
+    for path in write_paths:
+        s3_write_path = construct_s3_url(path)
+        s3_write_paths.append(s3_write_path)
+    return s3_write_paths
+
+
+def construct_s3_url(path: Optional[str]) -> Optional[str]:
+    if path:
+        return f"s3://{path}"
+    return None
 
 
 def upload_table(
-    table_slices,
-    s3_base_url,
-    s3_file_system,
-    s3_table_writer_func,
-    block_write_path_provider,
-    content_type,
-    s3_table_writer_kwargs,
-):
+    table_slices: Union[LocalTable, DistributedDataset],
+    s3_base_url: str,
+    s3_file_system: s3fs.S3FileSystem,
+    s3_table_writer_func: Callable,
+    block_write_path_provider: UuidBlockWritePathProvider,
+    content_type: ContentType,
+    s3_table_writer_kwargs: Dict[str, Any],
+) -> None:
     s3_table_writer_func(
         table_slices,
         s3_base_url,
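
Note: the diff truncates inside upload_table, but the retry scaffolding in upload_table_with_retry is fully visible above: a tenacity Retrying object wrapping each upload call, with captured write paths mapped through the new construct_s3_url. A self-contained sketch of that pattern using a stand-in flaky upload; the retry predicate and the stop delay are assumptions, since the diff only shows the wait strategy and the RetryableError and UPLOAD_SLICED_TABLE_RETRY_STOP_AFTER_DELAY imports:

    from tenacity import (
        Retrying,
        retry_if_exception_type,
        stop_after_delay,
        wait_random_exponential,
    )

    class RetryableError(Exception):  # stand-in for deltacat.exceptions.RetryableError
        pass

    attempts = {"count": 0}

    def flaky_upload():
        attempts["count"] += 1
        if attempts["count"] < 3:
            raise RetryableError("transient S3 failure")  # fails twice, then succeeds
        return ["my-bucket/prefix/part-0.parquet"]

    retrying = Retrying(
        wait=wait_random_exponential(multiplier=1, max=60),
        stop=stop_after_delay(30 * 60),  # stand-in for the imported delay constant
        retry=retry_if_exception_type(RetryableError),
    )
    write_paths = retrying(flaky_upload)
    # Mirrors construct_s3_url(): bare bucket/key paths become s3:// URLs.
    print([f"s3://{p}" for p in write_paths if p])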