deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -11,15 +11,15 @@ from deltacat.exceptions import (
     UnclassifiedDeltaCatError,
 )
 from daft.exceptions import DaftTransientError
-from deltacat.tests.local_deltacat_storage.exceptions import (
+from deltacat.tests.utils.exceptions import (
     InvalidNamespaceError,
-    LocalStorageValidationError,
+    MainStorageValidationError,
 )
+from deltacat.tests.utils import main_deltacat_storage_mock as ds
 from botocore.exceptions import NoCredentialsError
 from tenacity import retry, retry_if_exception_type, stop_after_attempt
 
 from pyarrow.lib import ArrowCapacityError
-import deltacat.tests.local_deltacat_storage as ds
 
 
 class MockUnknownException(Exception):
@@ -41,7 +41,7 @@ def mock_remote_task(exception_to_raise):
     mock_raise_exception(exception_to_raise)
 
 
-class TestCategorizeErrors(unittest.TestCase):
+class TestCategorizeErrorsMain(unittest.TestCase):
     def test_pyarrow_exception_categorizer(self):
         self.assertRaises(
             DependencyPyarrowCapacityError,
@@ -50,7 +50,7 @@ class TestCategorizeErrors(unittest.TestCase):
 
     def test_storage_exception_categorizer(self):
         self.assertRaises(
-            LocalStorageValidationError,
+            MainStorageValidationError,
             lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
         )
 
@@ -98,3 +98,7 @@ class TestCategorizeErrors(unittest.TestCase):
         return
 
         self.assertFalse(True)
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -1,8 +1,9 @@
 from typing import List, Optional, Union
 import pyarrow as pa
 from deltacat.storage import Delta, Partition, PartitionLocator, DeltaLocator
-import deltacat.tests.local_deltacat_storage as ds
+from deltacat.storage import metastore
 from deltacat.types.media import StorageType, ContentType
+from deltacat.storage.model.schema import Schema
 
 
 def create_delta_from_csv_file(
@@ -14,58 +15,89 @@ def create_delta_from_csv_file(
     *args,
     **kwargs,
 ) -> Delta:
+    assert file_paths is not None, "file_paths cannot be empty"
+    pa_table = create_table_from_csv_file_paths(file_paths)
+    schema = Schema.of(pa_table.schema)
     staged_partition = stage_partition_from_file_paths(
         namespace,
         file_paths,
+        schema,
         *args,
         table_name=table_name,
         table_version=table_version,
         **kwargs,
     )
     committed_delta = commit_delta_to_staged_partition(
-        staged_partition, file_paths, content_type=content_type, *args, **kwargs
+        staged_partition,
+        pa_table,
+        content_type,
+        *args,
+        **kwargs,
     )
     return committed_delta
 
 
+def create_table_from_csv_file_paths(
+    file_paths: List[str],
+) -> pa.Table:
+    tables = []
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
+    return pa.concat_tables(tables)
+
+
 def stage_partition_from_file_paths(
     namespace: str,
     file_paths: List[str],
+    schema: Schema,
     table_name: Optional[str] = None,
     table_version: int = 1,
     *args,
     **kwargs,
 ) -> Partition:
-    ds.create_namespace(namespace, {}, **kwargs)
+    if not metastore.namespace_exists(namespace, **kwargs):
+        metastore.create_namespace(namespace, **kwargs)
     if table_name is None:
         table_name = "-".join(file_paths).replace("/", "_")
-    ds.create_table_version(namespace, table_name, str(table_version), **kwargs)
-    stream = ds.get_stream(namespace, table_name, str(table_version), **kwargs)
-    staged_partition = ds.stage_partition(stream, [], **kwargs)
+    metastore.create_table_version(
+        namespace,
+        table_name,
+        str(table_version),
+        schema=schema,
+        **kwargs,
+    )
+    stream = metastore.get_stream(
+        namespace,
+        table_name,
+        str(table_version),
+        **kwargs,
+    )
+    staged_partition = metastore.stage_partition(stream, **kwargs)
     return staged_partition
 
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    pa_table: pa.Table,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
     committed_delta = commit_delta_to_partition(
         staged_partition,
+        pa_table,
+        content_type,
         *args,
-        file_paths=file_paths,
-        content_type=content_type,
         **kwargs,
     )
-    ds.commit_partition(staged_partition, **kwargs)
+    metastore.commit_partition(staged_partition, **kwargs)
     return committed_delta
 
 
 def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:
     return pa.concat_tables(
-        ds.download_delta(
+        metastore.download_delta(
             delta_like,
             storage_type=StorageType.LOCAL,
             *args,
@@ -76,23 +108,22 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
-        partition = ds.get_partition(
+        partition = metastore.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
-
-    table = pa.concat_tables(tables)
-    staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
+    staged_delta = metastore.stage_delta(
+        pa_table,
+        partition,
+        content_type=content_type,
+        **kwargs,
+    )
 
-    return ds.commit_delta(staged_delta, **kwargs)
+    return metastore.commit_delta(staged_delta, **kwargs)
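Note: the hunk above migrates these CSV test helpers from the removed deltacat.tests.local_deltacat_storage module to deltacat.storage.metastore. A minimal usage sketch follows, assuming the module path from the file list (deltacat/tests/test_utils/pyarrow.py); the namespace, CSV path, and storage_kwargs names are illustrative placeholders standing in for whatever catalog configuration real tests thread through *args/**kwargs, not values taken from the package.

# Hypothetical usage sketch of the metastore-backed helpers defined above.
import deltacat.tests.test_utils.pyarrow as pa_utils

storage_kwargs = {}  # placeholder for the catalog/metastore kwargs real test fixtures supply
delta = pa_utils.create_delta_from_csv_file(
    "test_namespace",       # namespace (placeholder)
    ["/tmp/sample.csv"],    # file_paths (placeholder path)
    table_name="sample_table",
    **storage_kwargs,
)
# download_delta concatenates the downloaded tables into a single pyarrow table
table = pa_utils.download_delta(delta, **storage_kwargs)
print(table.num_rows)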
@@ -25,12 +25,16 @@ from deltacat.storage import (
     NullOrder,
     Partition,
     PartitionKey,
+    PartitionKeyList,
     PartitionLocator,
     PartitionScheme,
+    PartitionSchemeList,
     Schema,
     SchemaList,
     SortScheme,
+    SortSchemeList,
     SortKey,
+    SortKeyList,
     SortOrder,
     StreamLocator,
     StreamFormat,
@@ -59,7 +63,10 @@ def create_empty_delta(
     manifest_entry_id: Optional[str] = None,
 ) -> Delta:
     stream_position = current_time_ms()
-    delta_locator = DeltaLocator.of(partition.locator, stream_position=stream_position)
+    delta_locator = DeltaLocator.of(
+        partition.locator,
+        stream_position=stream_position,
+    )
 
     if manifest_entry_id:
         manifest = Manifest.of(
@@ -131,12 +138,12 @@ def create_test_table_version():
         PartitionKey.of(
             key=["some_string", "some_int32"],
             name="test_partition_key",
-            field_id="test_field_id",
+            field_id=1,
             transform=bucket_transform,
         )
     ]
     partition_scheme = PartitionScheme.of(
-        keys=partition_keys,
+        keys=PartitionKeyList.of(partition_keys),
         name="test_partition_scheme",
         scheme_id="test_partition_scheme_id",
     )
@@ -151,7 +158,7 @@ def create_test_table_version():
         )
     ]
     sort_scheme = SortScheme.of(
-        keys=sort_keys,
+        keys=SortKeyList.of(sort_keys),
        name="test_sort_scheme",
        scheme_id="test_sort_scheme_id",
    )
@@ -166,8 +173,8 @@ def create_test_table_version():
         watermark=None,
         lifecycle_state=LifecycleState.CREATED,
         schemas=SchemaList.of([schema]),
-        partition_schemes=[partition_scheme],
-        sort_schemes=[sort_scheme],
+        partition_schemes=PartitionSchemeList.of([partition_scheme]),
+        sort_schemes=SortSchemeList.of([sort_scheme]),
     )
 
 
@@ -189,12 +196,12 @@ def create_test_stream():
         PartitionKey.of(
             key=["some_string", "some_int32"],
             name="test_partition_key",
-            field_id="test_field_id",
+            field_id=1,
             transform=bucket_transform,
         )
     ]
     partition_scheme = PartitionScheme.of(
-        keys=partition_keys,
+        keys=PartitionKeyList.of(partition_keys),
         name="test_partition_scheme",
         scheme_id="test_partition_scheme_id",
     )
@@ -217,28 +224,8 @@ def create_test_partition():
         partition_values=["a", 1],
         partition_id="test_partition_id",
     )
-    schema = Schema.of(
-        [
-            Field.of(
-                field=pa.field("some_string", pa.string(), nullable=False),
-                field_id=1,
-                is_merge_key=True,
-            ),
-            Field.of(
-                field=pa.field("some_int32", pa.int32(), nullable=False),
-                field_id=2,
-                is_merge_key=True,
-            ),
-            Field.of(
-                field=pa.field("some_float64", pa.float64()),
-                field_id=3,
-                is_merge_key=False,
-            ),
-        ]
-    )
     return Partition.of(
         locator=partition_locator,
-        schema=schema,
         content_types=[ContentType.PARQUET],
         state=CommitState.STAGED,
         previous_stream_position=0,
@@ -274,12 +261,14 @@ def create_test_delta():
         entry_params=manifest_entry_params,
     )
     manifest = Manifest.of(
-        entries=[
-            ManifestEntry.of(
-                url="s3://test/url",
-                meta=manifest_meta,
-            )
-        ],
+        entries=ManifestEntryList(
+            [
+                ManifestEntry.of(
+                    url="s3://test/url",
+                    meta=manifest_meta,
+                )
+            ]
+        ),
         author=ManifestAuthor.of(
             name="deltacat",
             version="2.0",
@@ -0,0 +1,104 @@
+import pytest
+import pandas as pd
+import pyarrow as pa
+
+from deltacat.types.tables import (
+    to_pandas,
+    to_pyarrow,
+    get_table_length,
+)
+
+
+def test_convert_to_pandas_error_cases():
+    """Test convert_to_pandas with invalid inputs."""
+    # Test None input
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas(None)
+
+    # Test unsupported type
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas("invalid_string")
+
+    # Test unsupported type with complex object
+    with pytest.raises(
+        ValueError, match="No pandas conversion function found for table type"
+    ):
+        to_pandas({"not": "a_dataframe"})
+
+
+def test_convert_to_arrow_error_cases():
+    """Test convert_to_arrow with invalid inputs."""
+    # Test None input
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow(None)
+
+    # Test unsupported type
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow("invalid_string")
+
+    # Test unsupported type with complex object
+    with pytest.raises(
+        ValueError, match="No pyarrow conversion function found for table type"
+    ):
+        to_pyarrow({"not": "a_table"})
+
+
+def test_conversion_functions_with_real_data():
+    """Test conversion functions with actual data structures."""
+    # Create test data
+    test_df = pd.DataFrame({"id": [1, 2], "name": ["test1", "test2"]})
+    test_table = pa.Table.from_pandas(test_df)
+
+    # Test pandas conversion
+    converted_df = to_pandas(test_df)
+    assert isinstance(converted_df, pd.DataFrame)
+    assert converted_df.equals(test_df)
+
+    # Test arrow conversion
+    converted_table = to_pyarrow(test_table)
+    assert isinstance(converted_table, pa.Table)
+    assert converted_table.equals(test_table)
+
+    # Test cross-conversion
+    df_from_table = to_pandas(test_table)
+    table_from_df = to_pyarrow(test_df)
+    assert isinstance(df_from_table, pd.DataFrame)
+    assert isinstance(table_from_df, pa.Table)
+
+
+def test_conversion_roundtrip_consistency():
+    """Test that conversion functions maintain data integrity through roundtrips."""
+    # Create test data
+    original_df = pd.DataFrame(
+        {
+            "id": [1, 2, 3, 4, 5],
+            "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
+            "age": [25, 30, 35, 40, 45],
+            "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
+        }
+    )
+
+    # Test pandas -> arrow -> pandas roundtrip
+    arrow_table = to_pyarrow(original_df)
+    roundtrip_df = to_pandas(arrow_table)
+
+    # Verify data integrity (allowing for potential type changes)
+    assert get_table_length(original_df) == get_table_length(
+        roundtrip_df
+    ), "Row count should be preserved"
+    assert list(original_df.columns) == list(
+        roundtrip_df.columns
+    ), "Column names should be preserved"
+
+    # Verify ID column integrity (critical for merge operations)
+    original_ids = sorted(original_df["id"].tolist())
+    roundtrip_ids = sorted(roundtrip_df["id"].tolist())
+    assert original_ids == roundtrip_ids, "ID column should be preserved exactly"
@@ -0,0 +1,22 @@
+"""
+Exception classes for main storage testing that mirror the local storage exceptions.
+These are used to test the main metastore error categorization functionality.
+"""
+
+
+class InvalidNamespaceError(Exception):
+    """Exception raised when an invalid namespace is provided to main storage."""
+
+    error_name = "InvalidNamespaceError"
+
+
+class MainStorageValidationError(Exception):
+    """Exception raised when main storage validation fails."""
+
+    error_name = "MainStorageValidationError"
+
+
+class MainStorageError(Exception):
+    """General exception for main storage operations."""
+
+    error_name = "MainStorageError"
@@ -0,0 +1,31 @@
+"""
+Mock module that provides storage-specific error categorization functions for main storage testing.
+"""
+
+from deltacat.tests.utils.exceptions import (
+    InvalidNamespaceError,
+    MainStorageValidationError,
+)
+
+
+def can_categorize(e: BaseException, **kwargs) -> bool:
+    """
+    Mock implementation of can_categorize for main storage testing.
+    Returns True if the input error can be categorized by main storage.
+    """
+    if isinstance(e, InvalidNamespaceError):
+        return True
+    else:
+        return False
+
+
+def raise_categorized_error(e: BaseException, **kwargs):
+    """
+    Mock implementation of raise_categorized_error for main storage testing.
+    Converts categorizable errors to their main storage equivalent.
+    """
+    if isinstance(e, InvalidNamespaceError):
+        raise MainStorageValidationError("Namespace provided is invalid!")
+    else:
+        # If we can't categorize it, re-raise the original exception
+        raise e
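Note: this mock only supplies the two hooks that the rewritten test_exceptions.py hunk above relies on when it passes deltacat_storage=ds. A minimal, self-contained sketch of its behavior, using only names defined in the two new test-utility files:

# Sketch: exercises the mock's categorization behavior directly.
from deltacat.tests.utils import main_deltacat_storage_mock as ds
from deltacat.tests.utils.exceptions import (
    InvalidNamespaceError,
    MainStorageValidationError,
)

err = InvalidNamespaceError("bad namespace")
assert ds.can_categorize(err)             # recognized error type
assert not ds.can_categorize(KeyError())  # anything else is not categorizable

try:
    ds.raise_categorized_error(err)       # re-raised as the main storage equivalent
except MainStorageValidationError:
    pass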
@@ -6,6 +6,8 @@ from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider
 from deltacat.types.media import ContentType
 import ray
+import gzip
+import json
 
 
 class TestDatasetToFile:
@@ -20,7 +22,13 @@ class TestDatasetToFile:
 
     @pytest.fixture(scope="module")
     def mock_dataset(self):
-        return from_items([{"col1": i, "col2": i * 2} for i in range(1000)])
+        # Include data that would need escaping to test quoting behavior
+        return from_items([{"col1": "a,b\tc|d", "col2": 0} for _ in range(5)])
+
+    @pytest.fixture(scope="module")
+    def mock_unescaped_dataset(self):
+        # Use data without delimiters for unescaped TSV test
+        return from_items([{"col1": "abc", "col2": 0} for _ in range(5)])
 
     @pytest.fixture(scope="module")
     def mock_filename_provider(self):
@@ -35,12 +43,12 @@ class TestDatasetToFile:
     def test_parquet_sanity(self, mock_dataset, mock_filename_provider):
         from deltacat.utils.ray_utils.dataset import dataset_to_file
 
-        fs: AbstractFileSystem = fsspec.filesystem("local")
+        fs: AbstractFileSystem = fsspec.filesystem("file")
 
         dataset_to_file(
             mock_dataset,
             self.BASE_PATH,
-            file_system=fs,
+            filesystem=fs,
             block_path_provider=mock_filename_provider,
         )
 
@@ -51,16 +59,126 @@ class TestDatasetToFile:
     def test_csv_sanity(self, mock_dataset, mock_filename_provider):
         from deltacat.utils.ray_utils.dataset import dataset_to_file
 
-        fs: AbstractFileSystem = fsspec.filesystem("local")
+        fs: AbstractFileSystem = fsspec.filesystem("file")
 
         dataset_to_file(
             mock_dataset,
             self.BASE_PATH,
-            file_system=fs,
+            filesystem=fs,
             block_path_provider=mock_filename_provider,
             content_type=ContentType.CSV.value,
         )
 
         file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
         assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify CSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to commas in data
+                assert '"a,b\tc|d",0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_tsv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.TSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify TSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to tabs in data
+                assert '"a,b\tc|d"\t0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_psv_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.PSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify PSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should be quoted due to pipes in data
+                assert '"a,b\tc|d"|0' in content
+
+        fs.delete(file_expected_at)
+
+    def test_unescaped_tsv_sanity(self, mock_unescaped_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_unescaped_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.UNESCAPED_TSV.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify UNESCAPED_TSV format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Should NOT be quoted since data has no delimiters
+                assert "abc\t0" in content
+
+        fs.delete(file_expected_at)
+
+    def test_json_sanity(self, mock_dataset, mock_filename_provider):
+        from deltacat.utils.ray_utils.dataset import dataset_to_file
+
+        fs: AbstractFileSystem = fsspec.filesystem("file")
+
+        dataset_to_file(
+            mock_dataset,
+            self.BASE_PATH,
+            filesystem=fs,
+            block_path_provider=mock_filename_provider,
+            content_type=ContentType.JSON.value,
+        )
+
+        file_expected_at = f"{self.BASE_PATH}/{self.SUB_PATH}"
+        assert fs.exists(file_expected_at), "file was not written"
+
+        # Verify JSON format and content
+        with fs.open(file_expected_at, "rb") as f:
+            with gzip.GzipFile(fileobj=f) as gz:
+                content = gz.read().decode("utf-8")
+                # Each line should be a valid JSON object
+                first_line = content.split("\n")[0]
+                record = json.loads(first_line)
+                assert record == {"col1": "a,b\tc|d", "col2": 0}
+
         fs.delete(file_expected_at)