deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff reflects the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py ADDED
@@ -0,0 +1,673 @@
+ import json
+ import sys
+ import pickle
+ import base64
+ from typing import Dict, List, Any
+ from pathlib import Path
+
+ from deltacat.utils.pyarrow import get_base_arrow_type_name
+
+
+ def load_test_data(json_file: str) -> tuple[List[Dict[str, Any]], Dict[str, Any]]:
+     """Load test results and metadata from JSON file."""
+     with open(json_file, "r") as f:
+         data = json.load(f)
+
+     if isinstance(data, dict):
+         if "test_results" in data and "metadata" in data:
+             # New format with metadata
+             return data["test_results"], data["metadata"]
+         else:
+             raise ValueError(f"Unexpected JSON structure in {json_file}")
+     elif isinstance(data, list):
+         # Old format - just a list of results
+         return data, {}
+     else:
+         raise ValueError(f"Unexpected JSON structure in {json_file}")
+
+
+ def load_test_results(json_file: str) -> List[Dict[str, Any]]:
+     """Load test results from JSON file (backward compatibility)."""
+     results, _ = load_test_data(json_file)
+     return results
+
+
+ def extract_physical_type_mapping_from_json(
+     result: Dict[str, Any], content_type_key: str
+ ) -> str:
+     """Extract physical type mapping from JSON result."""
+     if not result.get("success", False):
+         return None
+
+     # Even if PyArrow read failed, we can still extract physical schema if files were written
+     # The physical schema inspection happens at the file level, not via PyArrow read
+
+     physical_schema = result.get("physical_schema", {})
+
+     if physical_schema.get("error"):
+         return None
+
+     if content_type_key == "parquet":
+         columns = physical_schema.get("columns", {})
+         if columns:
+             first_col = next(iter(columns.values()))
+             physical_type = first_col.get("parquet_physical_type", "unknown")
+             logical_type = first_col.get("parquet_logical_type")
+             if logical_type and logical_type != "None":
+                 return f"{physical_type} ({logical_type})"
+             return physical_type
+
+     elif content_type_key == "feather":
+         columns = physical_schema.get("columns", {})
+         if columns:
+             first_col = next(iter(columns.values()))
+             return first_col.get("feather_preserved_type", "unknown")
+
+     elif content_type_key == "avro":
+         columns = physical_schema.get("columns", {})
+         if columns:
+             first_col = next(iter(columns.values()))
+             avro_type = first_col.get("avro_type")
+             if avro_type:
+                 return str(avro_type)
+             return "unknown"
+
+     elif content_type_key == "orc":
+         columns = physical_schema.get("columns", {})
+         if columns:
+             first_col = next(iter(columns.values()))
+             return first_col.get("orc_type_kind", "unknown")
+
+     return None
+
+
+ def generate_type_table_markdown(
+     arrow_type: str, arrow_description: str, results: List[Dict[str, Any]]
+ ) -> str:
+     """Generate a single type table in markdown format."""
+
+     # Filter results for this arrow type
+     type_results = [r for r in results if r["arrow_type"] == arrow_type]
+
+     if not type_results:
+         return (
+             f"\n#### **{arrow_description}** \nNo test results found for this type.\n"
+         )
+
+     # Organize results by dataset type and content type
+     dataset_types = ["pyarrow", "pandas", "polars", "daft", "ray_dataset"]
+     content_types = [
+         "application/parquet",
+         "application/feather",
+         "application/avro",
+         "application/orc",
+     ]
+     content_type_keys = ["parquet", "feather", "avro", "orc"]
+
+     # Build result matrix and physical mappings per dataset type
+     result_matrix = {}
+     dataset_physical_mappings = {}
+
+     for dataset_type in dataset_types:
+         result_matrix[dataset_type] = {}
+         dataset_physical_mappings[dataset_type] = {}
+
+         for content_type in content_types:
+             # Find the specific result
+             specific_result = next(
+                 (
+                     r
+                     for r in type_results
+                     if r["dataset_type"] == dataset_type
+                     and r["content_type"] == content_type
+                 ),
+                 None,
+             )
+
+             if specific_result:
+                 write_success = specific_result["success"]
+
+                 if write_success:
+                     result_matrix[dataset_type][content_type] = "✅"
+                 else:
+                     result_matrix[dataset_type][content_type] = "❌"  # Write failed
+
+                 # Extract physical type mapping for this dataset type
+                 content_key = content_type.replace("application/", "")
+                 physical_type = extract_physical_type_mapping_from_json(
+                     specific_result, content_key
+                 )
+                 if physical_type and physical_type != "unknown":
+                     dataset_physical_mappings[dataset_type][content_key] = physical_type
+             else:
+                 result_matrix[dataset_type][content_type] = "❓"
+
+     # Generate markdown table
+     markdown = f"\n#### **{arrow_description}**\n"
+     markdown += "| Dataset Type | Parquet | Feather | Avro | ORC | Physical Types |\n"
+     markdown += "|--------------|---------|---------|------|-----|---------------|\n"
+
+     for dataset_type in dataset_types:
+         row_results = result_matrix.get(dataset_type, {})
+         parquet_result = row_results.get("application/parquet", "❓")
+         feather_result = row_results.get("application/feather", "❓")
+         avro_result = row_results.get("application/avro", "❓")
+         orc_result = row_results.get("application/orc", "❓")
+
+         # Build physical types string for this dataset type
+         dataset_mappings = dataset_physical_mappings.get(dataset_type, {})
+         physical_parts = []
+
+         for content_key in content_type_keys:
+             if content_key in dataset_mappings:
+                 physical_parts.append(
+                     f"{content_key.title()}:`{dataset_mappings[content_key]}`"
+                 )
+
+         physical_col = "; ".join(physical_parts) if physical_parts else ""
+
+         markdown += f"| `{dataset_type}` | {parquet_result} | {feather_result} | {avro_result} | {orc_result} | {physical_col} |\n"
+
+     return markdown
+
+
+ def generate_read_compatibility_matrix_markdown(
+     results: List[Dict[str, Any]], arrow_type_descriptions: Dict[str, str]
+ ) -> str:
+     """Generate read compatibility matrix markdown from test results."""
+
+     # Collect all read compatibility data
+     read_compat_data = (
+         {}
+     )  # arrow_type -> writer_dataset -> content_type -> {reader_dataset: success}
+
+     for result in results:
+         arrow_type = result["arrow_type"]
+         arrow_type_description = arrow_type_descriptions.get(arrow_type, arrow_type)
+         writer_dataset = result["dataset_type"]
+         content_type = result["content_type"]
+         write_success = result.get("success", False)
+         dataset_read_results = result.get("dataset_read_results", [])
+
+         if arrow_type_description not in read_compat_data:
+             read_compat_data[arrow_type_description] = {}
+         if writer_dataset not in read_compat_data[arrow_type_description]:
+             read_compat_data[arrow_type_description][writer_dataset] = {}
+         if content_type not in read_compat_data[arrow_type_description][writer_dataset]:
+             read_compat_data[arrow_type_description][writer_dataset][content_type] = {}
+
+         if write_success:
+             # Only process read results if the write was successful
+             # Add PyArrow read result based on actual read success
+             # If pyarrow_read_success field is missing, we can't assume it succeeded
+             pyarrow_read_success = result.get("pyarrow_read_success")
+             if pyarrow_read_success is not None:
+                 read_compat_data[arrow_type_description][writer_dataset][content_type][
+                     "pyarrow"
+                 ] = pyarrow_read_success
+
+             # Add other dataset type read results
+             for read_result in dataset_read_results:
+                 reader_dataset = read_result["dataset_type"]
+                 success = read_result["success"]
+                 read_compat_data[arrow_type_description][writer_dataset][content_type][
+                     reader_dataset
+                 ] = success
+         else:
+             # Write failed - mark all readers as incompatible (represented by "—")
+             # This ensures the writer appears in the table but shows no compatibility data
+             pass
+
+     if not read_compat_data:
+         return (
+             "\n## Read Compatibility Tables\n\nNo read compatibility data available.\n"
+         )
+
+     # Generate markdown
+     markdown = """\n## Read Compatibility Tables\n\n
+ The following tables show read compatibility for each Arrow type across available writer/reader combinations.\n
+
+ This information is automatically used by DeltaCAT at write time to ensure that data written in one format can be
+ read by all supported reader types defined in a table's `SUPPORTED_READER_TYPES` table property. If data to be
+ written cannot be read by one or more supported reader types, then a `TableValidationError` will be raised.
+ """
+
+     # Get all dataset types that appear as readers
+     all_readers = set()
+     for arrow_data in read_compat_data.values():
+         for writer_data in arrow_data.values():
+             for content_data in writer_data.values():
+                 all_readers.update(content_data.keys())
+     all_readers = sorted(list(all_readers))
+
+     # Generate table for each arrow type
+     for arrow_type in sorted(read_compat_data.keys()):
+         markdown += f"\n### {arrow_type}\n\n"
+
+         # Organize by content type
+         content_types = set()
+         for writer_data in read_compat_data[arrow_type].values():
+             content_types.update(writer_data.keys())
+         content_types = sorted(list(content_types))
+
+         for content_type in content_types:
+             markdown += f"\n#### {content_type}\n\n"
+
+             # Find all writers for this content type
+             writers = []
+             for writer_dataset in sorted(read_compat_data[arrow_type].keys()):
+                 if content_type in read_compat_data[arrow_type][writer_dataset]:
+                     writers.append(writer_dataset)
+
+             if not writers:
+                 continue
+
+             # Create table header
+             markdown += "| Writer \\ Reader | " + " | ".join(all_readers) + " |\n"
+             markdown += "|" + "---|" * (len(all_readers) + 1) + "\n"
+
+             # Create table rows
+             for writer in writers:
+                 row = [f"**{writer}**"]
+                 reader_data = read_compat_data[arrow_type][writer][content_type]
+
+                 for reader in all_readers:
+                     if reader in reader_data:
+                         result = reader_data[reader]
+                         row.append("✅" if result else "❌")
+                     else:
+                         row.append("—")
+
+                 markdown += "| " + " | ".join(row) + " |\n"
+
+             markdown += "\n"
+
+     return markdown
+
+
+ def _normalize_complex_types(serialized_arrow_type: str) -> str:
+     """Normalize complex arrow types to their base type names without parameters.
+
+     This function uses the serialized PyArrow type for reliable normalization.
+
+     Args:
+         serialized_arrow_type: Base64-encoded pickled PyArrow type (required)
+
+     Returns:
+         Normalized type name using the common utility function
+
+     Raises:
+         ValueError: If serialized_arrow_type is None or deserialization fails
+     """
+     if not serialized_arrow_type:
+         raise ValueError(
+             "serialized_arrow_type is required for reliable type normalization"
+         )
+
+     # Deserialize the PyArrow type from base64-encoded pickle
+     serialized_bytes = base64.b64decode(serialized_arrow_type)
+     pa_type = pickle.loads(serialized_bytes)
+
+     # Use the common utility function for normalization
+     return get_base_arrow_type_name(pa_type)
+
+
+ def generate_reader_compatibility_mapping(
+     results: List[Dict[str, Any]],
+     output_file: str = "./reader_compatibility_mapping.py",
+ ) -> str:
+     """Generate reader compatibility mapping Python file from test results."""
+
+     # Collect compatibility data: (arrow_type, writer_dataset) -> list of compatible readers
+     compatibility_mapping = {}
+
+     for result in results:
+         if not result.get("success", False):
+             continue
+
+         # Get serialized arrow type (required for normalization)
+         serialized_arrow_type = result.get("serialized_arrow_type")
+
+         # Normalize complex types to base type names using serialized type
+         arrow_type = _normalize_complex_types(serialized_arrow_type)
+         writer_dataset = result["dataset_type"]
+         content_type = result["content_type"]
+
+         # Create key tuple
+         key = (arrow_type, writer_dataset, content_type)
+
+         compatible_readers = []
+
+         # Check PyArrow read success
+         pyarrow_read_success = result.get("pyarrow_read_success")
+         if pyarrow_read_success:
+             compatible_readers.append("PYARROW")
+
+         # Check other dataset type read results
+         dataset_read_results = result.get("dataset_read_results", [])
+         for read_result in dataset_read_results:
+             reader_dataset = read_result["dataset_type"]
+             success = read_result["success"]
+             if success:
+                 # Map to DatasetType enum values
+                 dataset_type_mapping = {
+                     "pyarrow": "PYARROW",
+                     "pandas": "PANDAS",
+                     "polars": "POLARS",
+                     "daft": "DAFT",
+                     "ray_dataset": "RAY_DATASET",
+                 }
+                 enum_value = dataset_type_mapping.get(reader_dataset)
+                 if enum_value and enum_value not in compatible_readers:
+                     compatible_readers.append(enum_value)
+
+         if compatible_readers:
+             # Merge with existing compatibility for same key (union of compatible readers)
+             if key in compatibility_mapping:
+                 existing_readers = set(compatibility_mapping[key])
+                 new_readers = set(compatible_readers)
+                 compatibility_mapping[key] = list(existing_readers.union(new_readers))
+             else:
+                 compatibility_mapping[key] = compatible_readers
+
+     # Generate Python file content
+     python_content = '''"""
+ Reader compatibility mapping generated from test results.
+
+ This mapping shows which DatasetType readers can successfully read data
+ written by each (arrow_type, writer_dataset_type, content_type) combination.
+
+ Keys: (arrow_type, writer_dataset_type, content_type)
+ Values: List of compatible DatasetType enum values
+ """
+
+ from deltacat.types.tables import DatasetType
+
+ # Mapping of (arrow_type, writer_dataset_type, content_type) -> list of compatible readers
+ READER_COMPATIBILITY_MAPPING = {
+ '''
+
+     # Sort keys for consistent output
+     for key in sorted(compatibility_mapping.keys()):
+         compatible_readers = compatibility_mapping[key]
+         arrow_type, writer_dataset, content_type = key
+
+         # Format as Python tuple and list
+         readers_str = (
+             "["
+             + ", ".join(
+                 [f"DatasetType.{reader}" for reader in sorted(compatible_readers)]
+             )
+             + "]"
+         )
+         python_content += f'    ("{arrow_type}", "{writer_dataset}", "{content_type}"): {readers_str},\n'
+
+     python_content += '''}
+
+ def get_compatible_readers(arrow_type: str, writer_dataset_type: str, content_type: str):
+     """Get list of compatible reader DatasetTypes for given combination."""
+     key = (arrow_type, writer_dataset_type, content_type)
+     compatible_readers = READER_COMPATIBILITY_MAPPING.get(key, [])
+     if (
+         DatasetType.PANDAS in compatible_readers
+         and DatasetType.NUMPY not in compatible_readers
+     ):
+         compatible_readers = compatible_readers + [DatasetType.NUMPY]
+     return compatible_readers
+
+ def is_reader_compatible(arrow_type: str, writer_dataset_type: str, content_type: str, reader_dataset_type: DatasetType) -> bool:
+     """Check if a specific reader is compatible with given combination."""
+     compatible_readers = get_compatible_readers(arrow_type, writer_dataset_type, content_type)
+     return reader_dataset_type in compatible_readers
+ '''
+
+     # Write to file
+     with open(output_file, "w") as f:
+         f.write(python_content)
+
+     print(f"✅ Generated reader compatibility mapping: {output_file}")
+     return output_file
+
+
+ def generate_complete_markdown_from_json(
+     json_file: str, output_file: str = "./docs/schema/README.md"
+ ):
+     """Generate complete markdown from JSON results."""
+
+     print(f"Loading results from {json_file}...")
+     results, metadata = load_test_data(json_file)
+     print(f"Loaded {len(results)} test results")
+
+     if metadata:
+         print(f"Found metadata with test date: {metadata.get('test_date', 'unknown')}")
+         print(f"PyArrow version: {metadata.get('pyarrow_version', 'unknown')}")
+     else:
+         raise ValueError(f"No metadata found in {json_file}")
+
+     # Get unique arrow types from results
+     arrow_types_in_results = sorted(list(set(r["arrow_type"] for r in results)))
+     print(
+         f"Found {len(arrow_types_in_results)} unique arrow types: {arrow_types_in_results}"
+     )
+     # map arrow type names to their descriptions using each results original_arrow_type field
+     arrow_type_descriptions = {}
+     for arrow_type in arrow_types_in_results:
+         # extract original_arrow_type field from each result
+         original_arrow_type = next(
+             (
+                 r["original_arrow_type"]
+                 for r in results
+                 if r["arrow_type"] == arrow_type
+             ),
+             None,
+         )
+         if original_arrow_type:
+             arrow_type_descriptions[arrow_type] = original_arrow_type
+
+     # Generate dynamic metadata section
+     test_date = metadata.get("test_date", "unknown")
+     if "T" in test_date:
+         # Convert ISO format to date only
+         test_date = test_date.split("T")[0]
+
+     pyarrow_version = metadata.get("pyarrow_version", "unknown")
+
+     markdown = f"""# Schemas
+
+ DeltaCAT tables may either be schemaless or backed by a schema based on the [Arrow type system](https://arrow.apache.org/docs/python/api/datatypes.html).
+
+ ## Schemaless Tables
+ A schemaless table is created via `dc.create_table(new_table_name)` (schema omitted) or
+ `dc.write_to_table(data, new_table_name, schema=None)` (schema explicitly set to `None` when writing
+ to a new table). Schemaless tables only save a record of files written to them over time without schema
+ inference, data validation, or data coercion. Since it may not be possible to derive a unified schema on
+ read, data returned via `manifest_table = dc.read_table(table_name)` is always a **Manifest Table**
+ containing an ordered list of files written to the table and their manifest entry info (e.g., size,
+ content type, content encoding, etc.). For example:
+
+ | Column | Value | Type | Description |
+ |----------------------------|---------------------------|----------|------------------------------------------------------|
+ | author_name | "deltacat.write_to_table" | str | Manifest producer name |
+ | author_version | "2.0.0b12" | str | Manifest producer version |
+ | id | None | str | Manifest entry ID (can be None) |
+ | mandatory | True | bool | Raise error if file is missing (True/False) |
+ | meta_content_encoding | "identity" | str | File content encoding (identity = no encoding) |
+ | meta_content_length | 2413 | int64 | File size in bytes (2.4 KB) |
+ | meta_content_type | "application/parquet" | str | File format (Parquet) |
+ | meta_record_count | 2 | int64 | Number of records in this file |
+ | meta_source_content_length | 176 | int64 | Original data size in memory (176 bytes) |
+ | previous_stream_position | 1 | int64 | Previous delta stream position |
+ | stream_position | 2 | int64 | This delta's stream position |
+ | path | /my_catalog/data/file.pq | str | File path relative to catalog root |
+
+ If you know that all paths can be read into a standard DeltaCAT dataset type (e.g., Daft, Ray Data, PyArrow,
+ Pandas, Polars), then this manifest table can be materialized via
+ `dataframe = dc.from_manifest_table(manifest_table)`.
+
+ Once created, schemaless tables cannot be altered to have a schema.
+
+ ## Standard Tables
+ Tables with schemas have their data validation and schema evolution behavior governed by **Schema
+ Consistency Types** and **Schema Evolution Modes**. This ensures that the table can always be materialized
+ with a unified schema at read time. By default, any DeltaCAT table created via
+ `dc.write_to_table(data, new_table_name)` infers a unified Arrow schema on write, and rejects writes
+ that would break reads for one or more supported dataset types. Once created, a standard table's
+ schema cannot be dropped.
+
+ ## Schema Consistency Types
+ DeltaCAT table schemas can either be **inferred** (default behavior) to follow the shape of written data
+ or **enforced** to define the shape of written data. The default schema consistency type of all fields
+ in a DeltaCAT table schema is configured by setting the `DEFAULT_SCHEMA_CONSISTENCY_TYPE` table property
+ to one of the following values:
+
+ \n\n**NONE** (default): No data consistency checks are run. The schema field's type will be automatically
+ promoted to the most permissive Arrow data type that all values can be safely cast to using
+ `pyarrow.unify_schemas(schemas, promote_options="permissive")`. If safe casting is impossible,
+ then a `SchemaValidationError` will be raised.
+
+ \n\n**COERCE**: Coerce fields to fit the schema whenever possible, even if data truncation is required. Fields
+ will be coerced using either `pyarrow.compute.cast` or `daft.expression.cast` with default options. If the
+ field cannot be coerced to fit the given type, then a `SchemaValidationError` will be raised.
+
+ \n\n**VALIDATE**: Strict data consistency checks. An error is raised for any field that doesn't match the schema.
+
+ A field's Schema Consistency Type can only be updated from least to most permissive (VALIDATE -> COERCE -> NONE).
+
+ ## Schema Evolution Modes
+ Schema evolution modes control how schema changes are handled when writing to a table.
+ A table's schema evolution mode is configured by setting the `SCHEMA_EVOLUTION_MODE`
+ table property to one of the following values:
+
+ \n\n**AUTO** (default): New fields are automatically added to the table schema at write time with their
+ Schema Consistency Type set by the `DEFAULT_SCHEMA_CONSISTENCY_TYPE` table property.
+
+ \n\n**MANUAL**: Existing schema fields with a Schema Consistency Type of `None` will continue to be automatically
+ updated to match the written data. New fields and other schema changes must be made explicitly via
+ `dc.alter_table(table_name, schema_updates=new_schema_updates)`. Attempts to write data with fields not in the
+ existing schema will raise a `SchemaValidationError`.
+
+ \n\n**DISABLED**: Existing schema fields with a Schema Consistency Type of `None` will continue to be automatically
+ updated to match the written data. All other schema changes are disabled, and manual attempts to alter the table's
+ schema will raise a `TableValidationError`.
+
+ A table's Schema Evolution Mode can be updated at any time.
+
+ ## Arrow to File Format Type Mappings
+ The tables below show DeltaCAT's actual Arrow write type mappings across all supported dataset and content types.
+ These mappings are generated by:
+
+ 1. Creating a PyArrow table with the target PyArrow data type via `pa.Table.from_arrays([pa.array(test_data, type=arrow_type)])`.
+ 2. Casting to the target dataset type via `data = dc.from_pyarrow(pyarrow_table, target_dataset_type)`.
+ 3. Writing to the target content type via `dc.write_to_table(data, table_name, content_type=target_content_type)`.
+
+ More details are available in the [type mapping generation script](../../deltacat/docs/autogen/schema/inference/generate_type_mappings.py).
+
+ ### Runtime Environment
+ **Generation Date:** {test_date}
+ \n**PyArrow Version:** {pyarrow_version}"""
+
+     # Add other version information if available
+     if metadata.get("deltacat_version"):
+         markdown += f"\n\n**DeltaCAT Version:** {metadata['deltacat_version']}"
+     if metadata.get("pandas_version"):
+         markdown += f"\n\n**Pandas Version:** {metadata['pandas_version']}"
+     if metadata.get("polars_version"):
+         markdown += f"\n\n**Polars Version:** {metadata['polars_version']}"
+     if metadata.get("daft_version") and metadata["daft_version"] != "not_available":
+         markdown += f"\n\n**Daft Version:** {metadata['daft_version']}"
+     if metadata.get("ray_version") and metadata["ray_version"] != "not_available":
+         markdown += f"\n\n**Ray Version:** {metadata['ray_version']}"
+
+     markdown += f"""
+
+ ### Type Mapping Tables
+ """
+
+     # Generate tables for each arrow type
+     for arrow_type in arrow_types_in_results:
+         description = arrow_type_descriptions.get(arrow_type, arrow_type)
+         type_table = generate_type_table_markdown(arrow_type, description, results)
+         markdown += type_table
+         print(f"Generated table for {arrow_type}")
+
+     # Generate read compatibility matrix
+     print("Generating read compatibility matrix...")
+     read_compat_markdown = generate_read_compatibility_matrix_markdown(
+         results, arrow_type_descriptions
+     )
+     markdown += read_compat_markdown
+     print("Generated read compatibility matrix")
+
+     # Write to file
+     with open(output_file, "w") as f:
+         f.write(markdown)
+
+     print(f"✅ Generated markdown: {output_file}")
+
+     # Analyze the results to identify the physical schema extraction issues
+     print("\n" + "=" * 80)
+     print("ANALYSIS: Physical Schema Extraction Issues")
+     print("=" * 80)
+
+     successful_extractions = 0
+     failed_extractions = 0
+     no_physical_data = 0
+
+     for result in results:
+         if result.get("success", False):
+             physical_schema = result.get("physical_schema", {})
+             if physical_schema.get("error"):
+                 failed_extractions += 1
+                 if "no written files found" in physical_schema.get("error", "").lower():
+                     no_physical_data += 1
+             elif physical_schema.get("columns"):
+                 successful_extractions += 1
+             else:
+                 no_physical_data += 1
+
+     print(f"Successful physical schema extractions: {successful_extractions}")
+     print(f"Failed extractions: {failed_extractions}")
+     print(f"No physical data: {no_physical_data}")
+     print(
+         f"Total successful tests: {len([r for r in results if r.get('success', False)])}"
+     )
+
+     return output_file
+
+
+ def main():
+     if len(sys.argv) < 2 or len(sys.argv) > 3:
+         print(
+             "Usage: python parse_json_type_mappings.py <json_results_file> [--python]"
+         )
+         sys.exit(1)
+
+     json_file = sys.argv[1]
+     generate_python = len(sys.argv) == 3 and sys.argv[2] == "--python"
+
+     if generate_python:
+         # Generate reader compatibility mapping
+         print(f"Loading results from {json_file} for compatibility mapping...")
+         results, _ = load_test_data(json_file)
+         print(f"Loaded {len(results)} test results")
+
+         # Navigate to project root for output
+         project_root = Path(__file__)
+         while project_root.name != "deltacat":
+             project_root = project_root.parent
+         output_file_path = project_root / "utils" / "reader_compatibility_mapping.py"
+         print(f"Writing reader compatibility mapping to {output_file_path}")
+         generate_reader_compatibility_mapping(results, str(output_file_path))
+     else:
+         # Generate markdown documentation
+         # keep navigating to parent directories until we find the docs directory
+         docs_dir = Path(__file__)
+         while docs_dir.name != "docs":
+             docs_dir = docs_dir.parent
+         output_file_path = docs_dir / "schema" / "README.md"
+         print(f"Writing to {output_file_path}")
+         generate_complete_markdown_from_json(json_file, output_file_path)
+
+
+ if __name__ == "__main__":
+     main()
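
A minimal usage sketch of the new parser (the input file name below is hypothetical; the script expects the JSON results produced by generate_type_mappings.py):

from deltacat.docs.autogen.schema.inference.parse_json_type_mappings import (
    load_test_data,
    generate_complete_markdown_from_json,
    generate_reader_compatibility_mapping,
)

# Load test results and metadata from a hypothetical results file.
results, metadata = load_test_data("type_mapping_results.json")
# Render the type-mapping and read-compatibility documentation as markdown.
generate_complete_markdown_from_json("type_mapping_results.json", "./docs/schema/README.md")
# Emit the READER_COMPATIBILITY_MAPPING module consulted by DeltaCAT at write time.
generate_reader_compatibility_mapping(results, "./reader_compatibility_mapping.py")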
deltacat/env.py CHANGED
@@ -1,3 +1,4 @@
+ import argparse
  import os
  import logging
  from typing import Dict, Any
@@ -49,3 +50,12 @@ def create_ray_runtime_environment() -> Dict[str, Any]:
          "env_vars": worker_env_vars,
      }
      return runtime_environment
+
+
+ def store_cli_args_in_os_environ(script_args_list=[]):
+     parser = argparse.ArgumentParser()
+     for args, kwargs in script_args_list:
+         parser.add_argument(*args, **kwargs)
+     args = parser.parse_args()
+     print(f"Command Line Arguments: {args}")
+     os.environ.update(vars(args))
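
A minimal usage sketch of the new store_cli_args_in_os_environ helper (the argument names and defaults below are hypothetical); each parsed argument is mirrored into os.environ keyed by its argparse destination name:

from deltacat.env import store_cli_args_in_os_environ

# Each entry is an (args, kwargs) pair forwarded to argparse.ArgumentParser.add_argument.
# String defaults are used because os.environ.update expects string values.
store_cli_args_in_os_environ(
    [
        (["--catalog-root"], {"help": "Catalog root path", "default": "/tmp/catalog"}),
        (["--namespace"], {"help": "Table namespace", "default": "default"}),
    ]
)
# After the call, os.environ["catalog_root"] and os.environ["namespace"] hold the parsed values.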