deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/types/media.py CHANGED
@@ -1,11 +1,14 @@
1
1
  from enum import Enum
2
- from typing import Set
2
+ from typing import Set, Dict
3
3
 
4
4
 
5
5
  class ContentType(str, Enum):
6
6
  """
7
- Enumeration used to resolve the entity-body Media Type (formerly known as
8
- MIME type) in an HTTP request.
7
+ Enumeration used to resolve a file's entity-body Media Type (formerly known
8
+ as MIME type). All content types here are writeable by at least one
9
+ :class:`deltacat.types.media.DatasetType`. The Media Type is used as the
10
+ content type of each :class:`deltacat.storage.model.manifest.ManifestEntry`
11
+ written by that dataset type.
9
12
 
10
13
  https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
11
14
 
@@ -16,16 +19,10 @@ class ContentType(str, Enum):
16
19
  AVRO = "application/avro"
17
20
  BINARY = "application/octet-stream"
18
21
  CSV = "text/csv"
19
- HDF = "application/x-hdf"
20
- HTML = "text/html"
21
22
  JSON = "application/json"
22
- TEXT = "text/plain"
23
- WEBDATASET = "application/x-web-dataset"
24
- XML = "text/xml"
25
23
 
26
24
  # unregistered types
27
25
  FEATHER = "application/feather"
28
- ION = "application/x-amzn-ion"
29
26
  ORC = "application/orc"
30
27
  PARQUET = "application/parquet"
31
28
  PSV = "text/psv"
@@ -37,7 +34,7 @@ class ContentEncoding(str, Enum):
37
34
  """
38
35
  Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
39
36
  to indicate that additional encodings have been applied to the entity-body
40
- Media Type in an HTTP request.
37
+ Media Type.
41
38
 
42
39
  https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
43
40
 
@@ -55,6 +52,46 @@ class ContentEncoding(str, Enum):
55
52
  SNAPPY = "snappy"
56
53
 
57
54
 
55
+ # Map of file extensions to content types
56
+ EXT_TO_CONTENT_TYPE: Dict[str, ContentType] = {
57
+ ".parquet": ContentType.PARQUET,
58
+ ".pq": ContentType.PARQUET,
59
+ ".csv": ContentType.CSV,
60
+ ".tsv": ContentType.TSV,
61
+ ".psv": ContentType.PSV,
62
+ ".json": ContentType.JSON,
63
+ ".feather": ContentType.FEATHER,
64
+ ".avro": ContentType.AVRO,
65
+ ".orc": ContentType.ORC,
66
+ }
67
+
68
+ # Inverse map of content types to file extensions
69
+ CONTENT_TYPE_TO_EXT: Dict[ContentType, str] = {
70
+ v: k for k, v in EXT_TO_CONTENT_TYPE.items()
71
+ }
72
+
73
+ # Map of file extensions to content encodings
74
+ EXT_TO_CONTENT_ENCODING: Dict[str, ContentEncoding] = {
75
+ ".gz": ContentEncoding.GZIP,
76
+ ".bz2": ContentEncoding.BZIP2,
77
+ ".zst": ContentEncoding.ZSTD,
78
+ ".sz": ContentEncoding.SNAPPY,
79
+ ".zz": ContentEncoding.DEFLATE,
80
+ ".zip": ContentEncoding.DEFLATE,
81
+ }
82
+
83
+ # Inverse map of content encodings to file extensions
84
+ CONTENT_ENCODING_TO_EXT: Dict[ContentEncoding, str] = {
85
+ v: k for k, v in EXT_TO_CONTENT_ENCODING.items()
86
+ }
87
+
88
+ SCHEMA_CONTENT_TYPES: Set[str] = {
89
+ ContentType.PARQUET.value,
90
+ ContentType.ORC.value,
91
+ ContentType.FEATHER.value,
92
+ ContentType.AVRO.value,
93
+ }
94
+
58
95
  DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
59
96
  ContentType.UNESCAPED_TSV.value,
60
97
  ContentType.TSV.value,
@@ -119,6 +156,158 @@ class DatasetType(str, Enum):
119
156
  DatasetType.PYARROW_PARQUET,
120
157
  }
121
158
 
159
+ def readable_content_types(self) -> Set[ContentType]:
160
+ # if this is DAFT then it can read PARQUET, JSON, and CSV
161
+ if self == DatasetType.DAFT:
162
+ return {
163
+ ContentType.PARQUET,
164
+ ContentType.JSON,
165
+ ContentType.CSV,
166
+ ContentType.PSV,
167
+ ContentType.TSV,
168
+ ContentType.UNESCAPED_TSV,
169
+ }
170
+ if self == DatasetType.RAY_DATASET:
171
+ return {
172
+ ContentType.CSV,
173
+ ContentType.TSV,
174
+ ContentType.UNESCAPED_TSV,
175
+ ContentType.PSV,
176
+ ContentType.PARQUET,
177
+ ContentType.JSON,
178
+ ContentType.AVRO,
179
+ ContentType.ORC,
180
+ ContentType.FEATHER,
181
+ }
182
+ if self == DatasetType.PYARROW:
183
+ return {
184
+ ContentType.CSV,
185
+ ContentType.TSV,
186
+ ContentType.UNESCAPED_TSV,
187
+ ContentType.PSV,
188
+ ContentType.PARQUET,
189
+ ContentType.FEATHER,
190
+ ContentType.JSON,
191
+ ContentType.AVRO,
192
+ ContentType.ORC,
193
+ }
194
+ if self == DatasetType.PANDAS:
195
+ return {
196
+ ContentType.CSV,
197
+ ContentType.TSV,
198
+ ContentType.UNESCAPED_TSV,
199
+ ContentType.PSV,
200
+ ContentType.PARQUET,
201
+ ContentType.FEATHER,
202
+ ContentType.JSON,
203
+ ContentType.AVRO,
204
+ ContentType.ORC,
205
+ }
206
+ if self == DatasetType.POLARS:
207
+ return {
208
+ ContentType.CSV,
209
+ ContentType.TSV,
210
+ ContentType.UNESCAPED_TSV,
211
+ ContentType.PSV,
212
+ ContentType.PARQUET,
213
+ ContentType.FEATHER,
214
+ ContentType.JSON,
215
+ ContentType.AVRO,
216
+ ContentType.ORC,
217
+ }
218
+ if self == DatasetType.NUMPY:
219
+ return {
220
+ ContentType.CSV,
221
+ ContentType.TSV,
222
+ ContentType.UNESCAPED_TSV,
223
+ ContentType.PSV,
224
+ ContentType.PARQUET,
225
+ ContentType.FEATHER,
226
+ ContentType.JSON,
227
+ ContentType.AVRO,
228
+ ContentType.ORC,
229
+ }
230
+ if self == DatasetType.PYARROW_PARQUET:
231
+ return {ContentType.PARQUET}
232
+ raise ValueError(f"No readable content types for {self}")
233
+
234
+ def writable_content_types(self) -> Set[ContentType]:
235
+ if self == DatasetType.PYARROW:
236
+ return {
237
+ ContentType.CSV,
238
+ ContentType.TSV,
239
+ ContentType.UNESCAPED_TSV,
240
+ ContentType.PSV,
241
+ ContentType.PARQUET,
242
+ ContentType.FEATHER,
243
+ ContentType.JSON,
244
+ ContentType.AVRO,
245
+ ContentType.ORC,
246
+ }
247
+ if self == DatasetType.PANDAS:
248
+ return {
249
+ ContentType.CSV,
250
+ ContentType.TSV,
251
+ ContentType.UNESCAPED_TSV,
252
+ ContentType.PSV,
253
+ ContentType.PARQUET,
254
+ ContentType.FEATHER,
255
+ ContentType.JSON,
256
+ ContentType.AVRO,
257
+ ContentType.ORC,
258
+ }
259
+ if self == DatasetType.POLARS:
260
+ return {
261
+ ContentType.CSV,
262
+ ContentType.TSV,
263
+ ContentType.UNESCAPED_TSV,
264
+ ContentType.PSV,
265
+ ContentType.PARQUET,
266
+ ContentType.FEATHER,
267
+ ContentType.JSON,
268
+ ContentType.AVRO,
269
+ ContentType.ORC,
270
+ }
271
+ if self == DatasetType.RAY_DATASET:
272
+ return {
273
+ ContentType.CSV,
274
+ ContentType.TSV,
275
+ ContentType.UNESCAPED_TSV,
276
+ ContentType.PSV,
277
+ ContentType.PARQUET,
278
+ ContentType.JSON,
279
+ }
280
+ if self == DatasetType.DAFT:
281
+ return {
282
+ ContentType.CSV,
283
+ ContentType.TSV,
284
+ ContentType.UNESCAPED_TSV,
285
+ ContentType.PSV,
286
+ ContentType.PARQUET,
287
+ ContentType.JSON,
288
+ }
289
+ if self == DatasetType.NUMPY:
290
+ return {
291
+ ContentType.CSV,
292
+ ContentType.TSV,
293
+ ContentType.UNESCAPED_TSV,
294
+ ContentType.PSV,
295
+ ContentType.PARQUET,
296
+ ContentType.FEATHER,
297
+ ContentType.JSON,
298
+ ContentType.AVRO,
299
+ ContentType.ORC,
300
+ }
301
+ if self == DatasetType.PYARROW_PARQUET:
302
+ return {}
303
+ raise ValueError(f"No writable content types for {self}")
304
+
305
+ def can_read(self, content_type: ContentType) -> bool:
306
+ return content_type in self.readable_content_types()
307
+
308
+ def can_write(self, content_type: ContentType) -> bool:
309
+ return content_type in self.writable_content_types()
310
+
122
311
 
123
312
  # deprecated by DatasetType - populated dynamically for backwards compatibility
124
313
  TableType = Enum(
@@ -140,6 +329,26 @@ class StorageType(str, Enum):
140
329
  DISTRIBUTED = "distributed"
141
330
 
142
331
 
332
+ DATASET_TYPE_TO_SUPPORTED_READ_CONTENT_TYPES: Dict[DatasetType, Set[str]] = {
333
+ DatasetType.DAFT: {
334
+ ContentType.CSV,
335
+ ContentType.PARQUET,
336
+ ContentType.JSON,
337
+ },
338
+ DatasetType.RAY_DATASET: {
339
+ ContentType.CSV,
340
+ ContentType.TSV,
341
+ ContentType.UNESCAPED_TSV,
342
+ ContentType.PSV,
343
+ ContentType.PARQUET,
344
+ ContentType.JSON,
345
+ ContentType.AVRO,
346
+ ContentType.ORC,
347
+ ContentType.FEATHER,
348
+ },
349
+ }
350
+
351
+
143
352
  class DatastoreType(str, Enum):
144
353
  """
145
354
  Enumeration used to identify the type of reader required to connect to and
@@ -148,10 +357,11 @@ class DatastoreType(str, Enum):
148
357
  writer for that data store. Note that, although some overlap exists between
149
358
  enum values here and in :class:`deltacat.types.media.ContentType`, each
150
359
  enum serve a different purpose. The purpose of
151
- :class:`deltacat.types.media.ContentType` is to resolve the MIME type for
152
- specific types of files, and may be used together with multi-content-type
153
- datastore types to describe the specific file types read/written to that
154
- datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
360
+ :class:`deltacat.types.media.ContentType` is to resolve a file's MIME type,
361
+ and may be used together with datastores that support storing different
362
+ file types to describe the specific file type read/written from/to that
363
+ datastore (e.g., DeltaCAT, Iceberg, Hudi, Delta Lake, Audio, Images, Video,
364
+ etc.)
155
365
  """
156
366
 
157
367
  # DeltaCAT Catalog Datasets