deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/docs/autogen/schema/inference/generate_type_mappings.py
@@ -0,0 +1,687 @@
+import os
+import json
+import pickle
+import tempfile
+import uuid
+import base64
+import shutil
+from datetime import datetime
+from polars.exceptions import PanicException
+from typing import List, Dict, Any, Tuple
+
+import deltacat as dc
+from deltacat import Catalog
+from deltacat.catalog import CatalogProperties
+from deltacat.types.media import ContentType, DatasetType
+from deltacat.types.tables import (
+    from_pyarrow,
+    TableWriteMode,
+    get_dataset_type,
+    get_table_length,
+    get_table_column_names,
+    get_table_schema,
+)
+from deltacat.storage import Metafile, Delta
+from deltacat.utils.pyarrow import get_supported_test_types
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pyarrow.orc as orc
+import pyarrow.feather as feather
+
+
+def get_version_info():
+    """Capture version information for all libraries."""
+    version_info = {
+        "test_date": datetime.now().isoformat(),
+        "pyarrow_version": pa.__version__,
+    }
+
+    # Get DeltaCAT version
+    try:
+        version_info["deltacat_version"] = dc.__version__
+    except AttributeError:
+        # Fallback if __version__ not available
+        try:
+            import pkg_resources
+
+            version_info["deltacat_version"] = pkg_resources.get_distribution(
+                "deltacat"
+            ).version
+        except Exception:
+            version_info["deltacat_version"] = "unknown"
+
+    # Get Pandas version
+    try:
+        import pandas as pd
+
+        version_info["pandas_version"] = pd.__version__
+    except ImportError:
+        version_info["pandas_version"] = "not_available"
+
+    # Get Polars version
+    try:
+        import polars as pl
+
+        version_info["polars_version"] = pl.__version__
+    except ImportError:
+        version_info["polars_version"] = "not_available"
+
+    # Get Daft version
+    try:
+        import daft
+
+        version_info["daft_version"] = daft.__version__
+    except (ImportError, AttributeError):
+        version_info["daft_version"] = "not_available"
+
+    # Get Ray version
+    try:
+        import ray
+
+        version_info["ray_version"] = ray.__version__
+    except (ImportError, AttributeError):
+        version_info["ray_version"] = "not_available"
+
+    return version_info
+
+
+def get_comprehensive_test_types() -> List[Tuple[str, str, List[Any]]]:
+    """Get comprehensive Arrow types for testing."""
+    return get_supported_test_types()
+
+
+def extract_file_paths_from_deltas(all_objects: List[Any]) -> List[str]:
+    """Extract file paths from Delta objects by parsing manifest entries."""
+    file_paths = []
+
+    for obj in all_objects:
+        obj_type = Metafile.get_class(obj)
+
+        if obj_type == Delta:
+            delta_obj = obj
+            # Access manifest entries to get file paths
+            if hasattr(delta_obj, "manifest") and delta_obj.manifest:
+                manifest = delta_obj.manifest
+                if hasattr(manifest, "entries") and manifest.entries:
+                    for entry in manifest.entries:
+                        file_url = entry.uri or entry.url
+
+                        # Convert file:// URLs to local paths
+                        if file_url.startswith("file://"):
+                            file_path = file_url[7:]
+                        else:
+                            file_path = file_url
+
+                        file_paths.append(file_path)
+
+    return file_paths
+
+
+def inspect_specific_file_physical_schema(
+    file_path: str, content_type: ContentType
+) -> Dict[str, Any]:
+    """Inspect the physical schema of a specific file."""
+
+    try:
+        if not os.path.exists(file_path):
+            return {"error": f"File not found: {file_path}"}
+
+        if content_type == ContentType.PARQUET:
+            parquet_file = pq.ParquetFile(file_path)
+            arrow_schema = parquet_file.schema_arrow
+            parquet_schema = parquet_file.schema
+            parquet_schema_string = str(parquet_schema)
+            column_info = {}
+            parquet_col_index = 0
+
+            for i in range(len(arrow_schema)):
+                arrow_field = arrow_schema.field(i)
+                arrow_type_str = str(arrow_field.type)
+
+                # For collection types, we need to handle them specially
+                col = parquet_schema.column(parquet_col_index)
+                if col.max_definition_level > 1 or col.max_repetition_level > 1:
+                    parquet_physical_type_name_suffix = "Unknown"
+                    if col.max_repetition_level > 0 and "list" in parquet_schema_string:
+                        parquet_physical_type_name_suffix = "List"
+                    elif col.max_definition_level > 0:
+                        if "map" in parquet_schema_string:
+                            parquet_physical_type_name_suffix = "Map"
+                        else:
+                            parquet_physical_type_name_suffix = "Struct"
+                    parquet_physical_type_name_prefix = (
+                        f"{col.max_definition_level}-Level"
+                        if col.max_definition_level > 0
+                        else ""
+                    )
+                    parquet_physical_type_name = f"{parquet_physical_type_name_prefix} {parquet_physical_type_name_suffix}"
+                    parquet_logical_type_name = (
+                        "LIST"
+                        if "(List)" in parquet_schema_string
+                        else "MAP"
+                        if "(Map)" in parquet_schema_string
+                        else ""
+                    )
+                    # For collection types, use the Arrow type as the "physical" representation
+                    # since Parquet's physical schema doesn't directly represent these structures
+                    print(f"Logical Type: {parquet_logical_type_name}")
+                    print(f"Physical Type: {parquet_physical_type_name}")
+                    print(f"Path: {col.path}")
+                    print(f"Max Definition Level: {col.max_definition_level}")
+                    print(f"Max Repetition Level: {col.max_repetition_level}")
+                    column_info[f"column_{i}"] = {
+                        "arrow_type": arrow_type_str,
+                        "parquet_physical_type": parquet_physical_type_name,
+                        "parquet_logical_type": parquet_logical_type_name,
+                        "parquet_converted_type": "unknown",
+                        "nullable": arrow_field.nullable,
+                    }
+                    # Skip the nested columns that are part of this complex type
+                    if "list<" in arrow_type_str.lower():
+                        parquet_col_index += 1  # Lists have nested structure
+                    elif "struct<" in arrow_type_str.lower():
+                        # Count the number of fields in the struct
+                        struct_fields = (
+                            arrow_type_str.count(",") + 1
+                            if "," in arrow_type_str
+                            else 1
+                        )
+                        parquet_col_index += struct_fields
+                    elif "dictionary<" in arrow_type_str.lower():
+                        parquet_col_index += 1  # Dictionary has values storage
+                else:
+                    # For simple types, use the actual Parquet column info
+                    try:
+                        col = parquet_schema.column(parquet_col_index)
+                        column_info[f"column_{i}"] = {
+                            "arrow_type": arrow_type_str,
+                            "parquet_physical_type": str(col.physical_type),
+                            "parquet_logical_type": str(col.logical_type)
+                            if col.logical_type
+                            else None,
+                            "parquet_converted_type": str(col.converted_type)
+                            if col.converted_type
+                            else None,
+                            "nullable": arrow_field.nullable,
+                        }
+                        parquet_col_index += 1
+                    except (IndexError, Exception):
+                        # Fallback if we can't match to parquet column
+                        column_info[f"column_{i}"] = {
+                            "arrow_type": arrow_type_str,
+                            "parquet_physical_type": "UNKNOWN",
+                            "parquet_logical_type": None,
+                            "parquet_converted_type": None,
+                            "nullable": arrow_field.nullable,
+                        }
+
+            return {
+                "format": "parquet",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.FEATHER:
+            feather_table = feather.read_table(file_path)
+
+            column_info = {}
+            for i, field in enumerate(feather_table.schema):
+                column_info[f"column_{i}"] = {
+                    "arrow_type": str(field.type),
+                    "feather_preserved_type": str(field.type),
+                    "nullable": field.nullable,
+                }
+
+            return {
+                "format": "feather",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.AVRO:
+            # For Avro, use fastavro to read the schema
+            import fastavro
+
+            with open(file_path, "rb") as f:
+                reader = fastavro.reader(f)
+                avro_schema = reader.writer_schema
+
+            column_info = {}
+            if "fields" in avro_schema:
+                for i, field in enumerate(avro_schema["fields"]):
+                    field_type = field["type"]
+                    # Handle union types (used for nullable fields)
+                    if isinstance(field_type, list):
+                        # A union containing "null" marks the field as nullable
+                        nullable = "null" in field_type
+                        non_null_types = [t for t in field_type if t != "null"]
+                        if non_null_types:
+                            field_type = non_null_types[0]
+                    else:
+                        nullable = False
+
+                    column_info[f"column_{i}"] = {
+                        "field_name": field["name"],
+                        "avro_type": str(field_type),
+                        "nullable": nullable,
+                        "original_field": field,
+                    }
+
+            return {
+                "format": "avro",
+                "columns": column_info,
+                "avro_schema": avro_schema,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+        elif content_type == ContentType.ORC:
+            orc_file = orc.ORCFile(file_path)
+
+            column_info = {}
+            for i, field in enumerate(orc_file.schema):
+                column_info[f"column_{i}"] = {
+                    "arrow_type": str(field.type),
+                    "orc_type_kind": str(field.type).split("(")[0]
+                    if "(" in str(field.type)
+                    else str(field.type),
+                    "nullable": field.nullable,
+                }
+
+            return {
+                "format": "orc",
+                "columns": column_info,
+                "file_size": os.path.getsize(file_path),
+                "file_path": file_path,
+            }
+
+    except (PanicException, Exception) as e:
+        return {
+            "error": f"Physical inspection failed: {str(e)}",
+            "error_type": type(e).__name__,
+            "file_path": file_path,
+        }
+
+
+def test_dataset_read_compatibility(
+    table_name: str,
+    namespace: str,
+    catalog_name: str,
+    dataset_types: List[DatasetType],
+) -> List[Dict[str, Any]]:
+    """Test reading the table with different dataset types."""
+    read_results = []
+
+    for read_dataset_type in dataset_types:
+        print(f" Testing read with {read_dataset_type.value}")
+        try:
+            read_result = dc.read_table(
+                table=table_name,
+                namespace=namespace,
+                catalog=catalog_name,
+                read_as=read_dataset_type,
+                max_parallelism=1,
+            )
+
+            # Verify the actual dataset type matches what we expected
+            actual_dataset_type = get_dataset_type(read_result)
+
+            # Extract basic information about the read result
+            result_info = {
+                "dataset_type": read_dataset_type.value,
+                "actual_dataset_type": actual_dataset_type.value,
+                "success": True,
+                "error": None,
+                "result_type": type(read_result).__name__,
+            }
+
+            # Use proper utility functions based on expected dataset type
+            try:
+                result_info["num_rows"] = get_table_length(read_result)
+            except Exception as e:
+                result_info["num_rows"] = f"Error getting length: {str(e)}"
+
+            try:
+                column_names = get_table_column_names(read_result)
+                result_info["num_columns"] = len(column_names)
+                result_info["column_names"] = column_names
+            except Exception as e:
+                result_info["num_columns"] = f"Error getting columns: {str(e)}"
+
+            # Get schema information using the utility function
+            try:
+                schema = get_table_schema(read_result)
+                result_info["schema"] = str(schema)
+                if schema.metadata is not None:
+                    result_info["has_metadata"] = True
+            except Exception as e:
+                result_info["schema"] = f"Schema error: {str(e)}"
+
+            read_results.append(result_info)
+            print(f" ✅ Read successful")
+
+        except (PanicException, Exception) as e:
+            read_results.append(
+                {
+                    "dataset_type": read_dataset_type.value,
+                    "success": False,
+                    "error": str(e),
+                    "error_type": type(e).__name__,
+                    "result_type": None,
+                    "schema": None,
+                    "num_columns": 0,
+                    "num_rows": 0,
+                }
+            )
+            print(f" ❌ Read failed: {str(e)[:100]}...")
+
+    return read_results
+
+
+def run_single_test(
+    arrow_type_name: str,
+    arrow_type_code: str,
+    test_data: List[Any],
+    dataset_type: DatasetType,
+    content_type: ContentType,
+    catalog_name: str,
+) -> Dict[str, Any]:
+    """Run a single test with proper file-to-test mapping using dc.list."""
+
+    try:
+        # Create Arrow table
+        arrow_type = eval(arrow_type_code)
+        arrow_table = pa.Table.from_arrays(
+            [pa.array(test_data, type=arrow_type)], names=[arrow_type_name]
+        )
+
+        # Convert to dataset type
+        write_dataset = from_pyarrow(arrow_table, dataset_type)
+
+        # Create unique table name with timestamp to avoid conflicts
+        timestamp = datetime.now().strftime("%H%M%S%f")
+        table_name = f"test_{arrow_type_name}_{dataset_type.value}_{content_type.value.replace('/', '_')}_{timestamp}"
+        namespace = "test_namespace"
+
+        print(f" Writing to table: {table_name}")
+
+        # Write to DeltaCAT with reader compatibility validation disabled
+        dc.write_to_table(
+            data=write_dataset,
+            table=table_name,
+            namespace=namespace,
+            catalog=catalog_name,
+            mode=TableWriteMode.CREATE,
+            content_type=content_type,
+            table_properties={
+                "supported_reader_types": None  # Disable reader compatibility validation
+            },
+        )
+
+        # Try to read back with PyArrow for type verification
+        pyarrow_read_success = True
+        read_result = None
+        pyarrow_read_error = None
+
+        try:
+            read_result = dc.read_table(
+                table=table_name,
+                namespace=namespace,
+                catalog=catalog_name,
+                read_as=DatasetType.PYARROW,
+                max_parallelism=1,
+            )
+            print(f" ✅ PyArrow read-back successful")
+        except Exception as e:
+            pyarrow_read_success = False
+            pyarrow_read_error = str(e)
+            print(f" ⚠️ PyArrow read-back failed: {str(e)[:100]}...")
+
+        # Test read compatibility with different dataset types
+        print(f" Testing read compatibility with other dataset types...")
+        additional_dataset_types = [
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        ]
+
+        dataset_read_results = test_dataset_read_compatibility(
+            table_name, namespace, catalog_name, additional_dataset_types
+        )
+
+        # Use dc.list with recursive=True to find the objects for this specific table
+        table_url = dc.DeltaCatUrl(f"dc://{catalog_name}/{namespace}/{table_name}")
+        print(f" Listing objects for: {table_url}")
+
+        try:
+            table_objects = dc.list(table_url, recursive=True)
+            print(f" Found {len(table_objects)} objects for table")
+
+            # Extract file paths from Delta objects
+            file_paths = extract_file_paths_from_deltas(table_objects)
+            print(f" Extracted {len(file_paths)} file paths")
+
+            if file_paths:
+                # Use the first file path (should be the one we just wrote)
+                file_path = file_paths[0]
+                print(f" Inspecting file: {file_path}")
+
+                # Inspect the physical schema of this specific file
+                physical_schema = inspect_specific_file_physical_schema(
+                    file_path, content_type
+                )
+            else:
+                physical_schema = {"error": "No file paths found in Delta objects"}
+
+        except Exception as e:
+            physical_schema = {"error": f"Failed to list table objects: {str(e)}"}
+
+        # Serialize the PyArrow type for reliable deserialization later
+        serialized_arrow_type = base64.b64encode(pickle.dumps(arrow_type)).decode(
+            "utf-8"
+        )
+
+        return {
+            "arrow_type": arrow_type_name,
+            "dataset_type": dataset_type.value,
+            "content_type": content_type.value,
+            "success": True,  # Write was successful
+            "pyarrow_read_success": pyarrow_read_success,
+            "pyarrow_read_error": pyarrow_read_error,
+            "original_arrow_type": str(arrow_type),
+            "serialized_arrow_type": serialized_arrow_type,
+            "read_back_type": str(read_result.schema.field(0).type)
+            if read_result and hasattr(read_result, "schema")
+            else "unknown",
+            "physical_schema": physical_schema,
+            "type_preserved": str(arrow_type) == str(read_result.schema.field(0).type)
+            if read_result and hasattr(read_result, "schema")
+            else False,
+            "error": None,
+            "table_name": table_name,
+            "dataset_read_results": dataset_read_results,
+        }
+
+    except (PanicException, Exception) as e:
+        print(f" Test failed with error: {str(e)}")
+
+        # Try to serialize the arrow_type even on failure (if arrow_type was created)
+        try:
+            arrow_type = eval(arrow_type_code)
+            original_arrow_type = str(arrow_type)
+            serialized_arrow_type = base64.b64encode(pickle.dumps(arrow_type)).decode(
+                "utf-8"
+            )
+        except Exception:
+            # If we can't create the arrow_type, we can't serialize it
+            original_arrow_type = "unknown"
+            serialized_arrow_type = None
+
+        return {
+            "arrow_type": arrow_type_name,
+            "dataset_type": dataset_type.value,
+            "content_type": content_type.value,
+            "success": False,  # Write failed
+            "pyarrow_read_success": False,
+            "pyarrow_read_error": None,  # Write failed, not read
+            "original_arrow_type": original_arrow_type,
+            "serialized_arrow_type": serialized_arrow_type,
+            "read_back_type": "unknown",
+            "physical_schema": {},
+            "type_preserved": False,
+            "error": str(e),
+            "error_category": "unknown",
+            "table_name": f"failed_{arrow_type_name}_{dataset_type.value}",
+            "dataset_read_results": [],
+        }
+
+
+def run_type_mapping_tests(catalog_name: str) -> List[Dict[str, Any]]:
+    """Run the actual type mapping tests and return results."""
+    arrow_types = get_comprehensive_test_types()
+    dataset_types = [
+        DatasetType.PYARROW,
+        DatasetType.PANDAS,
+        DatasetType.POLARS,
+        DatasetType.DAFT,
+        DatasetType.RAY_DATASET,
+    ]  # All dataset types
+    content_types = [
+        ContentType.PARQUET,
+        ContentType.FEATHER,
+        ContentType.AVRO,
+        ContentType.ORC,
+    ]  # Test 4 content types
+
+    print(
+        f"Testing {len(arrow_types)} Arrow types × {len(dataset_types)} dataset types × {len(content_types)} content types"
+    )
+    print()
+
+    all_results = []
+    test_count = 0
+    total_tests = len(arrow_types) * len(dataset_types) * len(content_types)
+
+    for arrow_type_name, arrow_type_code, test_data in arrow_types:
+        print(f"Testing PyArrow type: {arrow_type_name}")
+
+        for dataset_type in dataset_types:
+            for content_type in content_types:
+                test_count += 1
+                print(
+                    f" [{test_count:2d}/{total_tests}] {dataset_type.value} → {content_type.value}"
+                )
+
+                result = run_single_test(
+                    arrow_type_name,
+                    arrow_type_code,
+                    test_data,
+                    dataset_type,
+                    content_type,
+                    catalog_name,
+                )
+
+                if result["success"]:
+                    # Write was successful, check read status
+                    read_status = (
+                        "✅" if result.get("pyarrow_read_success", True) else "⚠️"
+                    )
+
+                    if result["physical_schema"].get("error"):
+                        print(
+                            f" {read_status} Write ✅, Physical schema error: {result['physical_schema']['error']}"
+                        )
+                    else:
+                        # Show extracted physical type
+                        columns = result["physical_schema"].get("columns", {})
+                        if columns:
+                            first_col = next(iter(columns.values()))
+                            if content_type == ContentType.PARQUET:
+                                physical_type = first_col.get(
+                                    "parquet_physical_type", "unknown"
+                                )
+                                print(
+                                    f" {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.FEATHER:
+                                physical_type = first_col.get(
+                                    "feather_preserved_type", "unknown"
+                                )
+                                print(
+                                    f" {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.AVRO:
+                                physical_type = first_col.get("avro_type", "unknown")
+                                print(
+                                    f" {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                            elif content_type == ContentType.ORC:
+                                physical_type = first_col.get(
+                                    "orc_type_kind", "unknown"
+                                )
+                                print(
+                                    f" {read_status} Write ✅, Physical type: {physical_type}"
+                                )
+                        else:
+                            print(f" {read_status} Write ✅, No column info found")
+
+                    # Show read error if any
+                    if not result.get("pyarrow_read_success", True):
+                        read_error = result.get("pyarrow_read_error", "unknown")
+                        print(f" PyArrow read failed: {read_error[:100]}...")
+                else:
+                    print(f" ❌ Write failed: {result.get('error', 'unknown')}")
+
+                all_results.append(result)
+                print()
+
+    return all_results
+
+
+def main():
+    print("=" * 80)
+    print("PHYSICAL SCHEMA EXTRACTION TEST")
+    print("=" * 80)
+    print("Using dc.list with table-specific URLs to map files to tests")
+
+    # Setup
+    temp_dir = tempfile.mkdtemp()
+    catalog_name = f"test-catalog-{uuid.uuid4()}"
+    catalog_props = CatalogProperties(root=temp_dir)
+    dc.put_catalog(catalog_name, catalog=Catalog(config=catalog_props))
+
+    print(f"Using catalog directory: {temp_dir}")
+
+    try:
+        # Run the tests
+        all_results = run_type_mapping_tests(catalog_name)
+
+        # Save detailed results with version information
+        version_info = get_version_info()
+        output_data = {"metadata": version_info, "test_results": all_results}
+
+        output_file_name = "generate_type_mappings_results.json"
+        with open(output_file_name, "w") as f:
+            json.dump(output_data, f, indent=2, default=str)
+
+        print(f"Detailed results saved to: {output_file_name}")
+        print(f"Catalog directory: {temp_dir}")
+
+    finally:
+        # Clean up test catalog and temporary directory
+        try:
+            dc.clear_catalogs()  # Clear catalog from memory
+            shutil.rmtree(temp_dir)  # Remove temporary directory and all contents
+            print(f"✅ Cleaned up test catalog directory: {temp_dir}")
+        except Exception as cleanup_error:
+            print(
+                f"⚠️ Warning: Failed to clean up catalog directory {temp_dir}: {cleanup_error}"
+            )
+            print("NOTE: You may need to manually delete this directory")
+
+
+if __name__ == "__main__":
+    main()
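
For reference, the sketch below (not part of the wheel) shows one way to consume the generate_type_mappings_results.json file that main() writes. The {"metadata": ..., "test_results": [...]} layout and the base64-encoded, pickled serialized_arrow_type field follow from main() and run_single_test() above; the path assumes the file sits in the current working directory.

import base64
import json
import pickle

# Load the results file written by main() above.
with open("generate_type_mappings_results.json") as f:
    output_data = json.load(f)

print("pyarrow version for the run:", output_data["metadata"]["pyarrow_version"])

for result in output_data["test_results"]:
    # Recover the original pyarrow.DataType object. str(...) representations
    # are lossy for parameterized types, which is why run_single_test()
    # pickles the type alongside its string form.
    if result.get("serialized_arrow_type"):
        arrow_type = pickle.loads(base64.b64decode(result["serialized_arrow_type"]))
        print(
            f"{result['arrow_type']} via {result['dataset_type']} as "
            f"{result['content_type']}: preserved={result['type_preserved']} "
            f"({arrow_type})"
        )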
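
And a condensed sketch of the write/read round-trip the script exercises, using only the deltacat calls that appear above; the catalog, namespace, and table names are placeholders, and all other write options are left at their defaults.

import tempfile

import pyarrow as pa

import deltacat as dc
from deltacat import Catalog
from deltacat.catalog import CatalogProperties
from deltacat.types.media import ContentType, DatasetType
from deltacat.types.tables import TableWriteMode

# Register a throwaway catalog rooted in a temp directory.
dc.put_catalog(
    "demo-catalog", catalog=Catalog(config=CatalogProperties(root=tempfile.mkdtemp()))
)

# Write a single-column PyArrow table as Parquet...
dc.write_to_table(
    data=pa.table({"i64": pa.array([1, 2, 3], type=pa.int64())}),
    table="demo_table",
    namespace="demo_namespace",
    catalog="demo-catalog",
    mode=TableWriteMode.CREATE,
    content_type=ContentType.PARQUET,
)

# ...then read it back as PyArrow and inspect the schema, as run_single_test() does.
round_tripped = dc.read_table(
    table="demo_table",
    namespace="demo_namespace",
    catalog="demo-catalog",
    read_as=DatasetType.PYARROW,
    max_parallelism=1,
)
print(round_tripped.schema)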