deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/pandas.py CHANGED
@@ -1,27 +1,305 @@
 import csv
-import io
 import logging
 import math
+import bz2
+import gzip
+from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union

 import pandas as pd
 import pyarrow as pa
+import pyarrow.fs as pafs
 from fsspec import AbstractFileSystem
 from ray.data.datasource import FilenameProvider

 from deltacat import logs
 from deltacat.types.media import (
     DELIMITED_TEXT_CONTENT_TYPES,
-    EXPLICIT_COMPRESSION_CONTENT_TYPES,
     TABULAR_CONTENT_TYPES,
     ContentEncoding,
     ContentType,
 )
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+from deltacat.types.partial_download import PartialFileDownloadParams

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

+# Encoding to file initialization function mapping
+ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
+    ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+    ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
+}
+
+
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_csv(f, **read_kwargs)
+                except (
+                    gzip.BadGzipFile,
+                    OSError,
+                    UnicodeDecodeError,
+                    pd.errors.EmptyDataError,
+                    Exception,
+                ):
+                    # If that fails, we need to reopen the file since the stream may be closed/corrupted
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_csv(input_file, **read_kwargs)
+            else:
+                return pd.read_csv(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_csv(input_file, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_parquet(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_parquet(input_file, **read_kwargs)
+            else:
+                return pd.read_parquet(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_parquet(input_file, **read_kwargs)
+
+
+def read_feather(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_feather(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_feather(input_file, **read_kwargs)
+            else:
+                return pd.read_feather(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_feather(input_file, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_orc(f, **read_kwargs)
+                except (gzip.BadGzipFile, OSError, pa.ArrowInvalid, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_orc(input_file, **read_kwargs)
+            else:
+                return pd.read_orc(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_orc(input_file, **read_kwargs)
+
+
+def read_json(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    return pd.read_json(f, **read_kwargs)
+                except (
+                    gzip.BadGzipFile,
+                    OSError,
+                    UnicodeDecodeError,
+                    ValueError,
+                    Exception,
+                ):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_stream(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        return pd.read_json(input_file, **read_kwargs)
+            else:
+                return pd.read_json(f, **read_kwargs)
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                return pd.read_json(input_file, **read_kwargs)
+
+
+def read_avro(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    **read_kwargs,
+) -> pd.DataFrame:
+    """
+    Read an Avro file using polars and convert to pandas.
+    """
+    import polars as pl
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+            # Handle compression with smart detection for PyArrow auto-decompression
+            if content_encoding in [
+                ContentEncoding.GZIP.value,
+                ContentEncoding.BZIP2.value,
+            ]:
+                try:
+                    # First try to read as if already decompressed by PyArrow
+                    pl_df = pl.read_avro(f, **read_kwargs)
+                    return pl_df.to_pandas()
+                except (gzip.BadGzipFile, OSError, Exception):
+                    # If that fails, we need to reopen the file
+                    pass
+
+                # Reopen and try manual decompression
+                with filesystem.open_input_file(path, **fs_open_kwargs) as f_retry:
+                    input_file_init = ENCODING_TO_FILE_INIT.get(
+                        content_encoding, lambda x: x
+                    )
+                    with input_file_init(f_retry) as input_file:
+                        pl_df = pl.read_avro(input_file, **read_kwargs)
+                        return pl_df.to_pandas()
+            else:
+                pl_df = pl.read_avro(f, **read_kwargs)
+                return pl_df.to_pandas()
+    else:
+        # fsspec AbstractFileSystem
+        with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+            # Handle compression
+            input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+            with input_file_init(f) as input_file:
+                pl_df = pl.read_avro(input_file, **read_kwargs)
+                return pl_df.to_pandas()
+

 CONTENT_TYPE_TO_PD_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pd.read_csv,
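
Usage sketch (not part of the diff): exercising the new encoding-aware read_csv above against a local gzip-compressed CSV. The import path deltacat.utils.pandas mirrors the file shown here; the /tmp paths are illustrative.

    import pandas as pd
    from deltacat.types.media import ContentEncoding
    from deltacat.utils.pandas import read_csv

    # Write a gzip-compressed CSV to read back.
    pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}).to_csv(
        "/tmp/data.csv.gz", index=False, compression="gzip"
    )

    # With no filesystem argument, the path resolves to a PyArrow filesystem,
    # which auto-decompresses .gz input streams; the manual GZIP fallback path
    # only runs if that first pd.read_csv attempt raises.
    df = read_csv(
        "/tmp/data.csv.gz",
        content_encoding=ContentEncoding.GZIP.value,
    )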
@@ -32,6 +310,21 @@ CONTENT_TYPE_TO_PD_READ_FUNC: Dict[str, Callable] = {
     ContentType.FEATHER.value: pd.read_feather,
     ContentType.ORC.value: pd.read_orc,
     ContentType.JSON.value: pd.read_json,
+    ContentType.AVRO.value: read_avro,
+}
+
+
+# New mapping for encoding-aware reader functions used by file_to_dataframe
+CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+    ContentType.UNESCAPED_TSV.value: read_csv,
+    ContentType.TSV.value: read_csv,
+    ContentType.CSV.value: read_csv,
+    ContentType.PSV.value: read_csv,
+    ContentType.PARQUET.value: read_parquet,
+    ContentType.FEATHER.value: read_feather,
+    ContentType.ORC.value: read_orc,
+    ContentType.JSON.value: read_json,
+    ContentType.AVRO.value: read_avro,
 }

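
Dispatch sketch (illustrative, not from the diff): readers are looked up in the new mapping by content type, and all share the path/filesystem/content_encoding signature defined above.

    from deltacat.types.media import ContentType
    from deltacat.utils.pandas import CONTENT_TYPE_TO_READ_FN

    reader = CONTENT_TYPE_TO_READ_FN[ContentType.PARQUET.value]  # read_parquet
    df = reader("/tmp/data.parquet")  # filesystem inferred from the path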
@@ -67,6 +360,7 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
             "header": None,
             "na_values": [""],
             "keep_default_na": False,
+            "quoting": csv.QUOTE_NONE,
         }
     if content_type == ContentType.TSV.value:
         return {"sep": "\t", "header": None}
@@ -74,11 +368,13 @@
         return {"sep": ",", "header": None}
     if content_type == ContentType.PSV.value:
         return {"sep": "|", "header": None}
+    if content_type == ContentType.JSON.value:
+        return {"lines": True}  # Support NDJSON format
     if content_type in {
         ContentType.PARQUET.value,
         ContentType.FEATHER.value,
         ContentType.ORC.value,
-        ContentType.JSON.value,
+        ContentType.AVRO.value,
     }:
         return {}
     raise ValueError(f"Unsupported content type: {content_type}")
@@ -92,7 +388,8 @@ ENCODING_TO_PD_COMPRESSION: Dict[str, str] = {


 def slice_dataframe(
-    dataframe: pd.DataFrame, max_len: Optional[int]
+    dataframe: pd.DataFrame,
+    max_len: Optional[int],
 ) -> List[pd.DataFrame]:
     """
     Iteratively create dataframe slices.
@@ -114,6 +411,22 @@ def concat_dataframes(dataframes: List[pd.DataFrame]) -> Optional[pd.DataFrame]:
     return pd.concat(dataframes, axis=0, copy=False)


+def append_column_to_dataframe(
+    dataframe: pd.DataFrame,
+    column_name: str,
+    column_value: Any,
+) -> pd.DataFrame:
+    dataframe[column_name] = column_value
+    return dataframe
+
+
+def select_columns(
+    dataframe: pd.DataFrame,
+    column_names: List[str],
+) -> pd.DataFrame:
+    return dataframe[column_names]
+
+
 def _add_column_kwargs(
     content_type: str,
     column_names: Optional[List[str]],
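
Sketch of the two new helpers (illustrative; import path assumed as above): append_column_to_dataframe assigns a constant-valued column in place, and select_columns projects by name.

    import pandas as pd
    from deltacat.utils.pandas import append_column_to_dataframe, select_columns

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    df = append_column_to_dataframe(df, "source", "part-0.parquet")
    projected = select_columns(df, ["a", "source"])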
@@ -135,38 +448,68 @@
     )


-def s3_file_to_dataframe(
-    s3_url: str,
+def file_to_dataframe(
+    path: str,
     content_type: str,
-    content_encoding: str,
+    content_encoding: str = ContentEncoding.IDENTITY.value,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    **s3_client_kwargs,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    fs_open_kwargs: Dict[str, Any] = {},
+    **kwargs,
 ) -> pd.DataFrame:
-
-    from deltacat.aws import s3u as s3_utils
-
+    """
+    Read a file into a Pandas DataFrame using any filesystem.
+
+    Args:
+        path: The file path to read
+        content_type: The content type of the file (e.g., ContentType.CSV.value)
+        content_encoding: The content encoding (default: IDENTITY)
+        filesystem: The filesystem to use (if None, will be inferred from path)
+        column_names: Optional column names to assign
+        include_columns: Optional columns to include in the result
+        pd_read_func_kwargs_provider: Optional kwargs provider for customization
+        fs_open_kwargs: Optional kwargs for filesystem open operations
+        **kwargs: Additional kwargs passed to the reader function
+
+    Returns:
+        pd.DataFrame: The loaded DataFrame
+    """
     logger.debug(
-        f"Reading {s3_url} to Pandas. Content type: {content_type}. "
+        f"Reading {path} to Pandas. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
-    s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
-    logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
-    pd_read_func = CONTENT_TYPE_TO_PD_READ_FUNC[content_type]
-    args = [io.BytesIO(s3_obj["Body"].read())]
-    kwargs = content_type_to_reader_kwargs(content_type)
-    _add_column_kwargs(content_type, column_names, include_columns, kwargs)
-
-    if content_type in EXPLICIT_COMPRESSION_CONTENT_TYPES:
-        kwargs["compression"] = ENCODING_TO_PD_COMPRESSION.get(
-            content_encoding, "infer"
+
+    pd_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+    if not pd_read_func:
+        raise NotImplementedError(
+            f"Pandas reader for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
         )
+
+    reader_kwargs = content_type_to_reader_kwargs(content_type)
+    _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+    # Merge with provided kwargs
+    reader_kwargs.update(kwargs)
+
     if pd_read_func_kwargs_provider:
-        kwargs = pd_read_func_kwargs_provider(content_type, kwargs)
-    logger.debug(f"Reading {s3_url} via {pd_read_func} with kwargs: {kwargs}")
-    dataframe, latency = timed_invocation(pd_read_func, *args, **kwargs)
-    logger.debug(f"Time to read {s3_url} into Pandas Dataframe: {latency}s")
+        reader_kwargs = pd_read_func_kwargs_provider(content_type, reader_kwargs)
+
+    logger.debug(f"Reading {path} via {pd_read_func} with kwargs: {reader_kwargs}")
+
+    dataframe, latency = timed_invocation(
+        pd_read_func,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        content_encoding=content_encoding,
+        **reader_kwargs,
+    )
+    logger.debug(f"Time to read {path} into Pandas DataFrame: {latency}s")
     return dataframe

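
Usage sketch for file_to_dataframe, the filesystem-agnostic replacement for s3_file_to_dataframe (illustrative paths; reader kwargs resolve per content type as shown above):

    from deltacat.types.media import ContentEncoding, ContentType
    from deltacat.utils.pandas import file_to_dataframe

    df = file_to_dataframe(
        path="/tmp/data.csv.gz",
        content_type=ContentType.CSV.value,
        content_encoding=ContentEncoding.GZIP.value,
        column_names=["a", "b"],  # CSV readers default to header=None
        include_columns=["a"],    # optional projection of the result
    )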
@@ -176,35 +519,210 @@ def dataframe_size(dataframe: pd.DataFrame) -> int:


 def write_csv(
-    dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-    with filesystem.open(path, "wb") as f:
-        # TODO (pdames): Add support for client-specified compression types.
-        with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
-            dataframe.to_csv(out, **kwargs)
+    # TODO (pdames): Add support for client-specified compression types.
+    if kwargs.get("header") is None:
+        kwargs["header"] = False
+
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                dataframe.to_csv(f, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_csv(out, **kwargs)
+
+
+def _preprocess_dataframe_for_parquet(dataframe: pd.DataFrame) -> pd.DataFrame:
+    """
+    Preprocess DataFrame to convert PyArrow types to native Python types for parquet compatibility.
+
+    This handles the case where from_pyarrow() creates pandas DataFrames with PyArrow array objects
+    that cannot be serialized by pandas.to_parquet().
+    """
+    # Check if any columns contain PyArrow arrays
+    needs_conversion = False
+    for col in dataframe.columns:
+        if dataframe[col].dtype == object:
+            # Check if the column contains PyArrow arrays
+            sample_val = dataframe[col].iloc[0] if len(dataframe) > 0 else None
+            if (
+                sample_val is not None
+                and hasattr(sample_val, "__class__")
+                and "pyarrow" in str(type(sample_val))
+            ):
+                needs_conversion = True
+                break
+
+    if not needs_conversion:
+        return dataframe
+
+    # Create a copy and convert PyArrow types
+    df_copy = dataframe.copy()
+
+    for col in df_copy.columns:
+        if df_copy[col].dtype == object and len(df_copy) > 0:
+            sample_val = df_copy[col].iloc[0]
+
+            # Convert PyArrow arrays to Python lists
+            if hasattr(sample_val, "__class__") and "pyarrow" in str(type(sample_val)):
+                try:
+                    if hasattr(sample_val, "to_pylist"):
+                        # PyArrow array - convert to Python list
+                        df_copy[col] = df_copy[col].apply(
+                            lambda x: x.to_pylist() if hasattr(x, "to_pylist") else x
+                        )
+                    elif hasattr(sample_val, "as_py"):
+                        # PyArrow scalar - convert to Python value
+                        df_copy[col] = df_copy[col].apply(
+                            lambda x: x.as_py() if hasattr(x, "as_py") else x
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Could not convert PyArrow column {col}: {e}. Keeping original values."
+                    )
+
+    return df_copy


 def write_parquet(
-    dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-    with filesystem.open(path, "wb") as f:
-        dataframe.to_parquet(f, **kwargs)
+    # Preprocess DataFrame to handle PyArrow types
+    processed_df = _preprocess_dataframe_for_parquet(dataframe)
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            processed_df.to_parquet(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            processed_df.to_parquet(f, **kwargs)
+
+
+def write_orc(
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            dataframe.to_orc(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            dataframe.to_orc(f, **kwargs)


 def write_feather(
-    dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-    with filesystem.open(path, "wb") as f:
-        dataframe.to_feather(f, **kwargs)
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            dataframe.to_feather(f, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            dataframe.to_feather(f, **kwargs)


 def write_json(
-    dataframe: pd.DataFrame, path: str, *, filesystem: AbstractFileSystem, **kwargs
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
+) -> None:
+    # Check if the path already indicates compression to avoid double compression
+    should_compress = path.endswith(".gz")
+
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path, filesystem)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            if should_compress:
+                # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                dataframe.to_json(f, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            if should_compress:
+                # For fsspec filesystems, we need to apply compression explicitly
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+            else:
+                # No compression indicated, apply explicit compression
+                with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                    dataframe.to_json(out, **kwargs)
+
+
+def write_avro(
+    dataframe: pd.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **kwargs,
 ) -> None:
-    with filesystem.open(path, "wb") as f:
-        # TODO (pdames): Add support for client-specified compression types.
-        with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
-            dataframe.to_json(out, **kwargs)
+    """
+    Write a pandas DataFrame to an AVRO file by delegating to polars implementation.
+    """
+    import polars as pl
+    from deltacat.utils.polars import write_avro as polars_write_avro
+
+    # Convert pandas DataFrame to polars
+    include_index = kwargs.pop("index", False)
+    pl_df = pl.from_pandas(dataframe, include_index=include_index)
+
+    # Remove pandas-specific kwargs before passing to polars
+    polars_kwargs = {k: v for k, v in kwargs.items() if k not in ["index"]}
+
+    # Delegate to polars write_avro implementation
+    polars_write_avro(
+        pl_df,
+        path,
+        filesystem=filesystem,
+        fs_open_kwargs=fs_open_kwargs,
+        **polars_kwargs,
+    )


 CONTENT_TYPE_TO_PD_WRITE_FUNC: Dict[str, Callable] = {
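
Sketch of the write-side compression rule above (illustrative paths): a ".gz" suffix defers to PyArrow's extension-based output compression, while other paths are wrapped in an explicit GZIP CompressedOutputStream.

    import pandas as pd
    from deltacat.utils.pandas import write_csv

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    write_csv(df, "/tmp/out.csv.gz")  # PyArrow output stream compresses by extension
    write_csv(df, "/tmp/out.csv")     # wrapped in pa.CompressedOutputStream(GZIP)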
@@ -215,6 +733,8 @@ CONTENT_TYPE_TO_PD_WRITE_FUNC: Dict[str, Callable] = {
     ContentType.PARQUET.value: write_parquet,
     ContentType.FEATHER.value: write_feather,
     ContentType.JSON.value: write_json,
+    ContentType.AVRO.value: write_avro,
+    ContentType.ORC.value: write_orc,
 }

@@ -224,7 +744,7 @@ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
             "sep": "\t",
             "header": False,
             "na_rep": [""],
-            "line_terminator": "\n",
+            "lineterminator": "\n",
             "quoting": csv.QUOTE_NONE,
             "index": False,
         }
@@ -232,28 +752,36 @@
         return {
             "sep": "\t",
             "header": False,
-            "line_terminator": "\n",
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
             "index": False,
         }
     if content_type == ContentType.CSV.value:
         return {
             "sep": ",",
             "header": False,
-            "line_terminator": "\n",
+            "index": False,
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
             "index": False,
         }
     if content_type == ContentType.PSV.value:
         return {
             "sep": "|",
             "header": False,
-            "line_terminator": "\n",
             "index": False,
+            "lineterminator": "\n",
+            "quoting": csv.QUOTE_MINIMAL,
         }
     if content_type == ContentType.PARQUET.value:
         return {"index": False}
     if content_type == ContentType.FEATHER.value:
         return {}
     if content_type == ContentType.JSON.value:
+        return {"index": False, "orient": "records", "lines": True}
+    if content_type == ContentType.AVRO.value:
+        return {"index": False}
+    if content_type == ContentType.ORC.value:
         return {"index": False}
     raise ValueError(f"Unsupported content type: {content_type}")

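
Sketch tying the reader and writer defaults together (illustrative): the JSON writer kwargs above emit newline-delimited JSON, which matches the new {"lines": True} reader default.

    from deltacat.types.media import ContentType
    from deltacat.utils.pandas import content_type_to_writer_kwargs

    kwargs = content_type_to_writer_kwargs(ContentType.JSON.value)
    # {"index": False, "orient": "records", "lines": True}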
@@ -261,9 +789,10 @@
 def dataframe_to_file(
     dataframe: pd.DataFrame,
     base_path: str,
-    file_system: AbstractFileSystem,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ) -> None:
     """
@@ -279,4 +808,4 @@
         f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
     )
     path = block_path_provider(base_path)
-    writer(dataframe, path, filesystem=file_system, **writer_kwargs)
+    writer(dataframe, path, filesystem=filesystem, **writer_kwargs)
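
Closing sketch for dataframe_to_file with a trivial block path provider (the lambda and paths are illustrative only):

    import pandas as pd
    from deltacat.types.media import ContentType
    from deltacat.utils.pandas import dataframe_to_file

    dataframe_to_file(
        pd.DataFrame({"a": [1, 2]}),
        "/tmp/blocks",
        None,  # filesystem resolved from the written path
        lambda base: f"{base}/part-0.parquet",
        content_type=ContentType.PARQUET.value,
    )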