deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,759 @@
+ import logging
+ import bz2
+ import gzip
+ from functools import partial
+ from typing import Optional, List, Dict, Callable, Union, Iterable, Any
+
+ import polars as pl
+ import pyarrow as pa
+ import pyarrow.fs as pafs
+
+ from fsspec import AbstractFileSystem
+ from ray.data.datasource import FilenameProvider
+
+ from deltacat import logs
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
+ from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
+ from deltacat.utils.performance import timed_invocation
+
+ from deltacat.types.media import (
+     ContentType,
+     ContentEncoding,
+     DELIMITED_TEXT_CONTENT_TYPES,
+     TABULAR_CONTENT_TYPES,
+ )
+ from deltacat.types.partial_download import PartialFileDownloadParams
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+ # Encoding to file initialization function mapping
+ ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
+     ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
+     ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
+     ContentEncoding.IDENTITY.value: lambda file_path: file_path,
+ }
+
+
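# A minimal sketch of using the mapping above (editorial illustration, assuming
# a hypothetical local file "data.csv.gz"; not part of the packaged module):
opener = ENCODING_TO_FILE_INIT[ContentEncoding.GZIP.value]
with opener("data.csv.gz") as f:  # same as gzip.open("data.csv.gz", mode="rb")
    raw_bytes = f.read()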
+ def write_json(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **write_kwargs,
+ ) -> None:
+     # Check if the path already indicates compression to avoid double compression
+     should_compress = path.endswith(".gz")
+
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             # PyArrow's open_output_stream compresses automatically when the
+             # path ends with .gz, so the same call covers both cases.
+             table.write_ndjson(f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             if should_compress:
+                 # For fsspec filesystems, we need to apply compression explicitly
+                 with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                     table.write_ndjson(out, **write_kwargs)
+             else:
+                 # No compression indicated, write uncompressed
+                 table.write_ndjson(f, **write_kwargs)
+
+
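# A minimal write_json usage sketch (hypothetical path): the .gz suffix alone
# selects gzip compression on either filesystem type.
df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
write_json(df, "/tmp/rows.ndjson.gz")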
+ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
+     """
+     Returns writer kwargs for the given content type when writing with polars.
+     """
+     if content_type == ContentType.UNESCAPED_TSV.value:
+         return {
+             "separator": "\t",
+             "include_header": False,
+             "null_value": "",
+             "quote_style": "never",  # Equivalent to QUOTE_NONE in pandas
+         }
+     if content_type == ContentType.TSV.value:
+         return {
+             "separator": "\t",
+             "include_header": False,
+             "quote_style": "necessary",
+         }
+     if content_type == ContentType.CSV.value:
+         return {
+             "separator": ",",
+             "include_header": False,
+             "quote_style": "necessary",
+         }
+     if content_type == ContentType.PSV.value:
+         return {
+             "separator": "|",
+             "include_header": False,
+             "quote_style": "necessary",
+         }
+     if content_type in {
+         ContentType.PARQUET.value,
+         ContentType.FEATHER.value,
+         ContentType.JSON.value,
+         ContentType.AVRO.value,
+         ContentType.ORC.value,
+     }:
+         return {}
+     raise ValueError(f"Unsupported content type: {content_type}")
+
+
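# A quick check of the defaults above (sketch): TSV writes are tab-separated,
# headerless, and quoted only when necessary.
assert content_type_to_writer_kwargs(ContentType.TSV.value) == {
    "separator": "\t",
    "include_header": False,
    "quote_style": "necessary",
}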
+ def write_csv(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **kwargs,
+ ) -> None:
+     """
+     Write a polars DataFrame to a CSV file (or other delimited text format).
+     """
+     # Check if the path already indicates compression to avoid double compression
+     should_compress = path.endswith(".gz")
+
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             # PyArrow's open_output_stream compresses automatically when the
+             # path ends with .gz, so the same call covers both cases.
+             table.write_csv(f, **kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             if should_compress:
+                 # For fsspec filesystems, we need to apply compression explicitly
+                 with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                     table.write_csv(out, **kwargs)
+             else:
+                 # No compression indicated, write uncompressed
+                 table.write_csv(f, **kwargs)
+
+
+ def write_avro(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             table.write_avro(f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_avro(f, **write_kwargs)
+
+
+ def write_parquet(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **write_kwargs,
+ ) -> None:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             table.write_parquet(f, **write_kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_parquet(f, **write_kwargs)
+
+
+ def write_feather(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **kwargs,
+ ) -> None:
+     """
+     Write a polars DataFrame to a Feather file.
+     """
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+             table.write_ipc(f, **kwargs)
+     else:
+         with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+             table.write_ipc(f, **kwargs)
+
+
+ def write_orc(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **write_kwargs,
+ ) -> None:
+     """
+     Write a polars DataFrame to an ORC file by delegating to the PyArrow implementation.
+     """
+     from deltacat.utils.pyarrow import write_orc as pyarrow_write_orc
+
+     # Convert polars DataFrame to PyArrow Table
+     pa_table = table.to_arrow()
+
+     # Delegate to PyArrow write_orc implementation
+     pyarrow_write_orc(
+         pa_table,
+         path,
+         filesystem=filesystem,
+         fs_open_kwargs=fs_open_kwargs,
+         **write_kwargs,
+     )
+
+
+ CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+     ContentType.UNESCAPED_TSV.value: write_csv,
+     ContentType.TSV.value: write_csv,
+     ContentType.CSV.value: write_csv,
+     ContentType.PSV.value: write_csv,
+     ContentType.PARQUET.value: write_parquet,
+     ContentType.FEATHER.value: write_feather,
+     ContentType.JSON.value: write_json,
+     ContentType.AVRO.value: write_avro,
+     ContentType.ORC.value: write_orc,
+ }
+
+
+ def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+     """
+     Iteratively create 0-copy table slices.
+     """
+     if max_len is None:
+         return [table]
+     tables = []
+     offset = 0
+     records_remaining = len(table)
+     while records_remaining > 0:
+         records_this_entry = min(max_len, records_remaining)
+         tables.append(table.slice(offset, records_this_entry))
+         records_remaining -= records_this_entry
+         offset += records_this_entry
+     return tables
+
+
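# A minimal slice_table sketch: five rows split into zero-copy slices of at
# most two rows each.
chunks = slice_table(pl.DataFrame({"x": [1, 2, 3, 4, 5]}), max_len=2)
assert [len(c) for c in chunks] == [2, 2, 1]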
+ def dataframe_size(table: pl.DataFrame) -> int:
+     return table.estimated_size()
+
+
+ def dataframe_to_file(
+     table: pl.DataFrame,
+     base_path: str,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
+     block_path_provider: Union[Callable, FilenameProvider],
+     content_type: str = ContentType.PARQUET.value,
+     schema: Optional[pa.Schema] = None,
+     **kwargs,
+ ) -> None:
+     """
+     Writes the given Polars DataFrame to a file.
+     """
+     writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+     writer_kwargs = content_type_to_writer_kwargs(content_type)
+     writer_kwargs.update(kwargs)
+     if not writer:
+         raise NotImplementedError(
+             f"Polars writer for content type '{content_type}' not "
+             f"implemented. Known content types: "
+             f"{list(CONTENT_TYPE_TO_PL_WRITE_FUNC.keys())}"
+         )
+     path = block_path_provider(base_path)
+     logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
+     writer(table, path, filesystem=filesystem, **writer_kwargs)
+
+
+ def write_table(
+     table: pl.DataFrame,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_type: str = ContentType.PARQUET.value,
+     **kwargs,
+ ) -> None:
+     """
+     Write a polars DataFrame to a file in the specified format.
+     """
+     writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+     writer_kwargs = content_type_to_writer_kwargs(content_type)
+     writer_kwargs.update(kwargs)
+     if not writer:
+         raise NotImplementedError(
+             f"Polars writer for content type '{content_type}' not "
+             f"implemented. Known content types: "
+             f"{list(CONTENT_TYPE_TO_PL_WRITE_FUNC.keys())}"
+         )
+     writer(
+         table,
+         path,
+         filesystem=filesystem,
+         fs_open_kwargs=fs_open_kwargs,
+         **writer_kwargs,
+     )
+
+
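# A minimal write_table usage sketch (hypothetical path): the content type
# selects the writer from CONTENT_TYPE_TO_PL_WRITE_FUNC and merges in the
# defaults from content_type_to_writer_kwargs().
df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
write_table(df, "/tmp/rows.parquet", content_type=ContentType.PARQUET.value)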
+ CONTENT_TYPE_TO_PL_READ_FUNC: Dict[str, Callable] = {
+     ContentType.UNESCAPED_TSV.value: pl.read_csv,
+     ContentType.TSV.value: pl.read_csv,
+     ContentType.CSV.value: pl.read_csv,
+     ContentType.PSV.value: pl.read_csv,
+     ContentType.PARQUET.value: pl.read_parquet,
+     ContentType.FEATHER.value: pl.read_ipc,
+     ContentType.JSON.value: pl.read_ndjson,
+     ContentType.AVRO.value: pl.read_avro,
+ }
+
+
+ class ReadKwargsProviderPolarsStringTypes(ContentTypeKwargsProvider):
+     """ReadKwargsProvider impl that reads columns of delimited text files
+     as UTF-8 strings (i.e. disables type inference). Useful for ensuring
+     lossless reads of UTF-8 delimited text datasets and improving read
+     performance in cases where type casting is not required."""
+
+     def __init__(self, include_columns: Optional[Iterable[str]] = None):
+         self.include_columns = include_columns
+
+     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
+         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+             include_columns = (
+                 self.include_columns if self.include_columns else kwargs.get("columns")
+             )
+             if not include_columns:
+                 # read all columns as strings - disable schema inference
+                 kwargs["infer_schema"] = False
+             else:
+                 # read only the included columns as strings
+                 kwargs["schema_overrides"] = {
+                     column_name: pl.Utf8 for column_name in include_columns
+                 }
+         return kwargs
+
+
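# A minimal sketch (hypothetical path): forcing delimited-text columns to be
# read as UTF-8 strings via the provider above and file_to_dataframe() below.
provider = ReadKwargsProviderPolarsStringTypes(include_columns=["id", "name"])
df = file_to_dataframe(
    "/tmp/rows.tsv",
    content_type=ContentType.TSV.value,
    column_names=["id", "name"],
    pl_read_func_kwargs_provider=provider,
)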
+ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
+     if content_type == ContentType.UNESCAPED_TSV.value:
+         return {
+             "separator": "\t",
+             "has_header": False,
+             "null_values": [""],
+             "quote_char": None,
+         }
+     if content_type == ContentType.TSV.value:
+         return {"separator": "\t", "has_header": False}
+     if content_type == ContentType.CSV.value:
+         return {"separator": ",", "has_header": False}
+     if content_type == ContentType.PSV.value:
+         return {"separator": "|", "has_header": False}
+     if content_type in {
+         ContentType.PARQUET.value,
+         ContentType.FEATHER.value,
+         ContentType.ORC.value,
+         ContentType.JSON.value,
+         ContentType.AVRO.value,
+     }:
+         return {}
+     raise ValueError(f"Unsupported content type: {content_type}")
+
+
+ def _add_column_kwargs(
+     content_type: str,
+     column_names: Optional[List[str]],
+     include_columns: Optional[List[str]],
+     kwargs: Dict[str, Any],
+ ):
+     if content_type in DELIMITED_TEXT_CONTENT_TYPES:
+         if column_names:
+             kwargs["new_columns"] = column_names
+         if include_columns:
+             kwargs["columns"] = include_columns
+     elif content_type in TABULAR_CONTENT_TYPES:
+         if include_columns:
+             kwargs["columns"] = include_columns
+     elif include_columns:
+         logger.warning(
+             f"Ignoring request to include columns {include_columns} "
+             f"for non-tabular content type {content_type}"
+         )
+
+
+ def concat_dataframes(dataframes: List[pl.DataFrame]) -> Optional[pl.DataFrame]:
+     if dataframes is None or not len(dataframes):
+         return None
+     if len(dataframes) == 1:
+         return next(iter(dataframes))
+     return pl.concat(dataframes)
+
+
+ def append_column_to_table(
+     table: pl.DataFrame,
+     column_name: str,
+     column_value: Any,
+ ) -> pl.DataFrame:
+     return table.with_columns(pl.lit(column_value).alias(column_name))
+
+
+ def select_columns(
+     table: pl.DataFrame,
+     column_names: List[str],
+ ) -> pl.DataFrame:
+     return table.select(column_names)
+
+
+ def file_to_dataframe(
+     path: str,
+     content_type: str,
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     column_names: Optional[List[str]] = None,
+     include_columns: Optional[List[str]] = None,
+     pl_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     **kwargs,
+ ) -> pl.DataFrame:
+     """
+     Read a file into a Polars DataFrame using any filesystem.
+
+     Args:
+         path: The file path to read
+         content_type: The content type of the file (e.g., ContentType.CSV.value)
+         content_encoding: The content encoding (default: IDENTITY)
+         filesystem: The filesystem to use (if None, will be inferred from path)
+         column_names: Optional column names to assign
+         include_columns: Optional columns to include in the result
+         pl_read_func_kwargs_provider: Optional kwargs provider for customization
+         partial_file_download_params: Optional partial download parameters
+             (currently unused by this reader)
+         fs_open_kwargs: Optional kwargs for filesystem open operations
+         **kwargs: Additional kwargs passed to the reader function
+
+     Returns:
+         pl.DataFrame: The loaded DataFrame
+     """
+     logger.debug(
+         f"Reading {path} to Polars. Content type: {content_type}. "
+         f"Encoding: {content_encoding}"
+     )
+
+     pl_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
+     if not pl_read_func:
+         raise NotImplementedError(
+             f"Polars reader for content type '{content_type}' not "
+             f"implemented. Known content types: "
+             f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
+         )
+
+     reader_kwargs = content_type_to_reader_kwargs(content_type)
+     _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
+
+     # Merge with provided kwargs
+     reader_kwargs.update(kwargs)
+
+     if pl_read_func_kwargs_provider:
+         reader_kwargs = pl_read_func_kwargs_provider(content_type, reader_kwargs)
+
+     logger.debug(f"Reading {path} via {pl_read_func} with kwargs: {reader_kwargs}")
+
+     dataframe, latency = timed_invocation(
+         pl_read_func,
+         path,
+         filesystem=filesystem,
+         fs_open_kwargs=fs_open_kwargs,
+         content_encoding=content_encoding,
+         **reader_kwargs,
+     )
+     logger.debug(f"Time to read {path} into Polars DataFrame: {latency}s")
+     return dataframe
+
+
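# A minimal file_to_dataframe usage sketch (hypothetical path): reading a
# gzip-compressed CSV with explicit column names through the encoding-aware
# readers defined below.
df = file_to_dataframe(
    "/tmp/events.csv.gz",
    content_type=ContentType.CSV.value,
    content_encoding=ContentEncoding.GZIP.value,
    column_names=["ts", "event"],
)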
+ def read_csv(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         if content_encoding == ContentEncoding.IDENTITY.value:
+             with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                 return pl.read_csv(f, **read_kwargs)
+         else:
+             # For compressed files with PyArrow, we need to be careful because PyArrow
+             # may auto-decompress some formats. Try to read directly first.
+             try:
+                 with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                     # Try reading as if it's already decompressed by PyArrow
+                     return pl.read_csv(f, **read_kwargs)
+             except Exception:
+                 # If that fails, try manual decompression
+                 with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                     input_file_init = ENCODING_TO_FILE_INIT.get(
+                         content_encoding, lambda x: x
+                     )
+                     with input_file_init(f) as input_file:
+                         content = input_file.read()
+                         if isinstance(content, str):
+                             content = content.encode("utf-8")
+                         return pl.read_csv(content, **read_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_csv(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     if isinstance(content, str):
+                         content = content.encode("utf-8")
+                     return pl.read_csv(content, **read_kwargs)
+
+
+ def read_parquet(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_parquet(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_parquet(content, **read_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_parquet(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_parquet(content, **read_kwargs)
+
+
+ def read_ipc(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_ipc(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_ipc(content, **read_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_ipc(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_ipc(content, **read_kwargs)
+
+
+ def read_ndjson(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         if content_encoding == ContentEncoding.IDENTITY.value:
+             with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                 return pl.read_ndjson(f, **read_kwargs)
+         else:
+             # For compressed files with PyArrow, we need to be careful because PyArrow
+             # may auto-decompress some formats. Try to read directly first.
+             try:
+                 with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+                     # Try reading as if it's already decompressed by PyArrow
+                     return pl.read_ndjson(f, **read_kwargs)
+             except Exception:
+                 # If that fails, try manual decompression
+                 with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+                     input_file_init = ENCODING_TO_FILE_INIT.get(
+                         content_encoding, lambda x: x
+                     )
+                     with input_file_init(f) as input_file:
+                         content = input_file.read()
+                         if isinstance(content, str):
+                             content = content.encode("utf-8")
+                         return pl.read_ndjson(content, **read_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_ndjson(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     if isinstance(content, str):
+                         content = content.encode("utf-8")
+                     return pl.read_ndjson(content, **read_kwargs)
+
+
+ def read_avro(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_avro(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_avro(content, **read_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 return pl.read_avro(f, **read_kwargs)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content as bytes and pass to polars
+                     content = input_file.read()
+                     return pl.read_avro(content, **read_kwargs)
+
+
+ def read_orc(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, Any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pl.DataFrame:
+     """
+     Read an ORC file using pandas and convert to polars, since polars doesn't have native ORC support.
+     """
+     import io
+
+     import pandas as pd
+
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 pd_df = pd.read_orc(f, **read_kwargs)
+                 return pl.from_pandas(pd_df)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content and pass to pandas
+                     content = input_file.read()
+                     pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                     return pl.from_pandas(pd_df)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             if content_encoding == ContentEncoding.IDENTITY.value:
+                 pd_df = pd.read_orc(f, **read_kwargs)
+                 return pl.from_pandas(pd_df)
+             else:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Read decompressed content and pass to pandas
+                     content = input_file.read()
+                     pd_df = pd.read_orc(io.BytesIO(content), **read_kwargs)
+                     return pl.from_pandas(pd_df)
+
+
+ # New mapping for encoding-aware reader functions used by file_to_dataframe
+ CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+     ContentType.UNESCAPED_TSV.value: read_csv,
+     ContentType.TSV.value: read_csv,
+     ContentType.CSV.value: read_csv,
+     ContentType.PSV.value: read_csv,
+     ContentType.PARQUET.value: read_parquet,
+     ContentType.FEATHER.value: read_ipc,
+     ContentType.JSON.value: read_ndjson,
+     ContentType.AVRO.value: read_avro,
+     ContentType.ORC.value: read_orc,
+ }
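
Taken together, the writer and reader tables give a symmetric round-trip API. A minimal end-to-end sketch against the new module (hypothetical local path, shown for illustration):

import polars as pl
from deltacat.types.media import ContentType
from deltacat.utils.polars import file_to_dataframe, write_table

df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
write_table(df, "/tmp/rows.parquet", content_type=ContentType.PARQUET.value)
assert df.equals(file_to_dataframe("/tmp/rows.parquet", content_type=ContentType.PARQUET.value))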