deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/utils/export.py CHANGED
@@ -5,7 +5,9 @@ import pyarrow.parquet
5
5
  import pyarrow.feather
6
6
  from typing import Callable, Dict
7
7
 
8
- from deltacat.storage.rivulet.reader.query_expression import QueryExpression
8
+ from deltacat.experimental.storage.rivulet.reader.query_expression import (
9
+ QueryExpression,
10
+ )
9
11
  from deltacat import logs
10
12
 
11
13
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -2,12 +2,13 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  from typing import Optional, Tuple, Union, List
5
+ from datetime import timedelta
6
+ from enum import Enum
5
7
 
6
8
  import sys
7
9
  import urllib
8
10
  import pathlib
9
11
 
10
- import pyarrow
11
12
  import pyarrow as pa
12
13
  from pyarrow.fs import (
13
14
  _resolve_filesystem_and_path,
@@ -17,15 +18,59 @@ from pyarrow.fs import (
17
18
  FileSystem,
18
19
  FSSpecHandler,
19
20
  PyFileSystem,
21
+ GcsFileSystem,
22
+ LocalFileSystem,
23
+ S3FileSystem,
24
+ AzureFileSystem,
25
+ HadoopFileSystem,
20
26
  )
21
27
 
22
28
  _LOCAL_SCHEME = "local"
23
29
 
24
30
 
31
+ class FilesystemType(str, Enum):
32
+ LOCAL = "local"
33
+ S3 = "s3"
34
+ GCS = "gcs"
35
+ AZURE = "azure"
36
+ HADOOP = "hadoop"
37
+ UNKNOWN = "unknown"
38
+
39
+ @classmethod
40
+ def from_filesystem(cls, filesystem: FileSystem) -> FilesystemType:
41
+ if isinstance(filesystem, LocalFileSystem):
42
+ return cls.LOCAL
43
+ elif isinstance(filesystem, S3FileSystem):
44
+ return cls.S3
45
+ elif isinstance(filesystem, GcsFileSystem):
46
+ return cls.GCS
47
+ elif isinstance(filesystem, AzureFileSystem):
48
+ return cls.AZURE
49
+ elif isinstance(filesystem, HadoopFileSystem):
50
+ return cls.HADOOP
51
+ else:
52
+ return cls.UNKNOWN
53
+
54
+ @classmethod
55
+ def to_filesystem(cls, filesystem_type: FilesystemType) -> FileSystem:
56
+ if filesystem_type == cls.LOCAL:
57
+ return LocalFileSystem()
58
+ elif filesystem_type == cls.S3:
59
+ return S3FileSystem()
60
+ elif filesystem_type == cls.GCS:
61
+ return GcsFileSystem()
62
+ elif filesystem_type == cls.AZURE:
63
+ return AzureFileSystem()
64
+ elif filesystem_type == cls.HADOOP:
65
+ return HadoopFileSystem()
66
+ else:
67
+ raise ValueError(f"Unsupported filesystem type: {filesystem_type}")
68
+
69
+
25
70
  def resolve_paths_and_filesystem(
26
71
  paths: Union[str, List[str]],
27
- filesystem: pyarrow.fs.FileSystem = None,
28
- ) -> Tuple[List[str], pyarrow.fs.FileSystem]:
72
+ filesystem: FileSystem = None,
73
+ ) -> Tuple[List[str], FileSystem]:
29
74
  """
30
75
  Resolves and normalizes all provided paths, infers a filesystem from the
31
76
  paths or validates the provided filesystem against the paths and ensures
@@ -113,19 +158,26 @@ def resolve_paths_and_filesystem(
113
158
  else:
114
159
  raise
115
160
  if filesystem is None:
116
- filesystem = resolved_filesystem
161
+ if isinstance(resolved_filesystem, GcsFileSystem):
162
+ # Configure a retry time limit for GcsFileSystem so that it
163
+ # doesn't hang forever trying to get file info (e.g., when
164
+ # trying to get a public file w/o anonymous=True).
165
+ filesystem = GcsFileSystem(
166
+ retry_time_limit=timedelta(seconds=60),
167
+ )
168
+ else:
169
+ filesystem = resolved_filesystem
117
170
  elif need_unwrap_path_protocol:
118
171
  resolved_path = _unwrap_protocol(resolved_path)
119
172
  resolved_path = filesystem.normalize_path(resolved_path)
120
173
  resolved_paths.append(resolved_path)
121
-
122
174
  return resolved_paths, filesystem
123
175
 
124
176
 
125
177
  def resolve_path_and_filesystem(
126
178
  path: str,
127
- filesystem: Optional[pyarrow.fs.FileSystem] = None,
128
- ) -> Tuple[str, pyarrow.fs.FileSystem]:
179
+ filesystem: Optional[FileSystem] = None,
180
+ ) -> Tuple[str, FileSystem]:
129
181
  """
130
182
  Resolves and normalizes the provided path, infers a filesystem from the
131
183
  path or validates the provided filesystem against the path.
@@ -148,7 +200,7 @@ def resolve_path_and_filesystem(
148
200
 
149
201
  def list_directory(
150
202
  path: str,
151
- filesystem: pyarrow.fs.FileSystem,
203
+ filesystem: FileSystem,
152
204
  exclude_prefixes: Optional[List[str]] = None,
153
205
  ignore_missing_path: bool = False,
154
206
  recursive: bool = False,
@@ -199,7 +251,7 @@ def list_directory(
199
251
 
200
252
  def get_file_info(
201
253
  path: str,
202
- filesystem: pyarrow.fs.FileSystem,
254
+ filesystem: FileSystem,
203
255
  ignore_missing_path: bool = False,
204
256
  ) -> FileInfo:
205
257
  """Get the file info for the provided path."""
@@ -213,6 +265,62 @@ def get_file_info(
213
265
  return file_info
214
266
 
215
267
 
268
+ def write_file(
269
+ path: str,
270
+ data: Union[str, bytes],
271
+ filesystem: Optional[FileSystem] = None,
272
+ ) -> None:
273
+ """
274
+ Write data to a file using any filesystem.
275
+
276
+ Args:
277
+ path: The file path to write to.
278
+ data: The data to write (string or bytes).
279
+ filesystem: The filesystem implementation to use. If None, will be inferred from the path.
280
+ """
281
+ resolved_path, resolved_filesystem = resolve_path_and_filesystem(
282
+ path=path,
283
+ filesystem=filesystem,
284
+ )
285
+
286
+ # Convert string to bytes if necessary
287
+ if isinstance(data, str):
288
+ data = data.encode("utf-8")
289
+
290
+ with resolved_filesystem.open_output_stream(resolved_path) as f:
291
+ f.write(data)
292
+
293
+
294
+ def read_file(
295
+ path: str,
296
+ filesystem: Optional[FileSystem] = None,
297
+ fail_if_not_found: bool = True,
298
+ ) -> Optional[bytes]:
299
+ """
300
+ Read data from a file using any filesystem.
301
+
302
+ Args:
303
+ path: The file path to read from.
304
+ filesystem: The filesystem implementation to use. If None, will be inferred from the path.
305
+ fail_if_not_found: Whether to raise an error if the file is not found.
306
+
307
+ Returns:
308
+ The file data as bytes, or None if file not found and fail_if_not_found is False.
309
+ """
310
+ try:
311
+ resolved_path, resolved_filesystem = resolve_path_and_filesystem(
312
+ path=path,
313
+ filesystem=filesystem,
314
+ )
315
+
316
+ with resolved_filesystem.open_input_stream(resolved_path) as f:
317
+ return f.read()
318
+ except FileNotFoundError:
319
+ if fail_if_not_found:
320
+ raise
321
+ return None
322
+
323
+
216
324
  def _handle_read_os_error(
217
325
  error: OSError,
218
326
  paths: Union[str, List[str]],
@@ -227,6 +335,9 @@ def _handle_read_os_error(
227
335
  r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
228
336
  r"body\.(.*))$"
229
337
  )
338
+ gcp_error_pattern = (
339
+ r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
340
+ )
230
341
  if re.match(aws_error_pattern, str(error)):
231
342
  # Specially handle AWS error when reading files, to give a clearer error
232
343
  # message to avoid confusing users. The real issue is most likely that the AWS
@@ -243,9 +354,28 @@ def _handle_read_os_error(
243
354
  "You can also run AWS CLI command to get more detailed error message "
244
355
  "(e.g., aws s3 ls <file-name>). "
245
356
  "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html " # noqa
357
+ "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
358
+ "for more information."
359
+ )
360
+ )
361
+ elif re.match(gcp_error_pattern, str(error)):
362
+ # Special handling for GCP errors (e.g., handling the special case of
363
+ # requiring the filesystem to be instantiated with anonymous access to
364
+ # read public files).
365
+ if isinstance(paths, str):
366
+ paths = f'"{paths}"'
367
+ raise OSError(
368
+ (
369
+ f"Failing to read GCP GS file(s): {paths}. "
370
+ "Please check that file exists and has properly configured access. "
371
+ "If this is a public file, please instantiate a filesystem with "
372
+ "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
373
+ "to read it. See https://google.aip.dev/auth/4110 and "
374
+ "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html" # noqa
246
375
  "for more information."
247
376
  )
248
377
  )
378
+
249
379
  else:
250
380
  raise error
251
381
 
@@ -1,6 +1,7 @@
1
1
  import posixpath
2
2
  import pyarrow.fs
3
3
 
4
+ from deltacat.constants import REV_DIR_NAME
4
5
  from deltacat.storage.model.partition import PartitionLocator
5
6
  from deltacat.utils.filesystem import resolve_path_and_filesystem
6
7
 
@@ -28,7 +29,7 @@ def _find_first_child_with_rev(
28
29
  )
29
30
  for child in children:
30
31
  if child.type == pyarrow.fs.FileType.Directory:
31
- rev_path = posixpath.join(child.path, "rev")
32
+ rev_path = posixpath.join(child.path, REV_DIR_NAME)
32
33
  if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
33
34
  return child.base_name
34
35
  raise ValueError(f"No directory with 'rev/' found under {parent_path}")
deltacat/utils/numpy.py CHANGED
@@ -1,14 +1,21 @@
1
- from typing import List, Optional, Callable, Union
1
+ from typing import List, Optional, Callable, Union, Dict, Any
2
2
 
3
+ import pandas as pd
3
4
  import numpy as np
4
- import pyarrow as pa
5
5
  from fsspec import AbstractFileSystem
6
+ import pyarrow.fs as pafs
7
+ import logging
6
8
 
7
9
  from ray.data.datasource import FilenameProvider
8
- from deltacat.types.media import ContentType
10
+ from deltacat.types.media import ContentType, ContentEncoding
9
11
  from deltacat.utils import pandas as pd_utils
10
- from deltacat.utils import pyarrow as pa_utils
12
+
11
13
  from deltacat.utils.common import ReadKwargsProvider
14
+ from deltacat import logs
15
+ from deltacat.utils.performance import timed_invocation
16
+ from deltacat.types.partial_download import PartialFileDownloadParams
17
+
18
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
12
19
 
13
20
 
14
21
  def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarray]:
@@ -22,26 +29,61 @@ def slice_ndarray(np_array: np.ndarray, max_len: Optional[int]) -> List[np.ndarr
22
29
  return [np_array[i : i + max_len] for i in range(0, len(np_array), max_len)]
23
30
 
24
31
 
def file_to_ndarray(
    path: str,
    content_type: str,
    content_encoding: str = ContentEncoding.IDENTITY.value,
    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
    column_names: Optional[List[str]] = None,
    include_columns: Optional[List[str]] = None,
    pd_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
    fs_open_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Read a file into a NumPy ndarray using any filesystem.

    This function delegates to the pandas file_to_dataframe function and converts
    the resulting DataFrame to a NumPy ndarray via ``DataFrame.to_numpy``. The
    combined read + conversion latency is logged at debug level.

    Args:
        path: The file path to read
        content_type: The content type of the file (e.g., ContentType.CSV.value)
        content_encoding: The content encoding (default: IDENTITY)
        filesystem: The filesystem to use; forwarded as-is to file_to_dataframe
        column_names: Optional column names, forwarded to file_to_dataframe
        include_columns: Optional columns to include, forwarded to
            file_to_dataframe
        pd_read_func_kwargs_provider: Optional kwargs provider, forwarded to
            file_to_dataframe
        partial_file_download_params: Optional partial download parameters,
            forwarded to file_to_dataframe
        fs_open_kwargs: Optional kwargs for filesystem open operations. When
            omitted (or None), a new empty dict is created for this call.
        **kwargs: Additional kwargs forwarded to file_to_dataframe

    Returns:
        np.ndarray: The loaded data as a NumPy ndarray
    """
    # Build a fresh dict per call: a `{}` default in the signature would be a
    # single object shared by every invocation (and by anything downstream
    # that mutates it).
    if fs_open_kwargs is None:
        fs_open_kwargs = {}

    logger.debug(
        f"Reading {path} to NumPy ndarray. Content type: {content_type}. "
        f"Encoding: {content_encoding}"
    )

    dataframe, latency = timed_invocation(
        pd_utils.file_to_dataframe,
        path=path,
        content_type=content_type,
        content_encoding=content_encoding,
        filesystem=filesystem,
        column_names=column_names,
        include_columns=include_columns,
        pd_read_func_kwargs_provider=pd_read_func_kwargs_provider,
        partial_file_download_params=partial_file_download_params,
        fs_open_kwargs=fs_open_kwargs,
        **kwargs,
    )

    # Time the DataFrame -> ndarray conversion separately so the reported
    # total covers both the read and the conversion.
    ndarray, conversion_latency = timed_invocation(dataframe.to_numpy)
    total_latency = latency + conversion_latency
    logger.debug(f"Time to read {path} into NumPy ndarray: {total_latency}s")
    return ndarray
45
87
 
46
88
 
47
89
  def ndarray_size(np_array: np.ndarray) -> int:
@@ -51,22 +93,72 @@ def ndarray_size(np_array: np.ndarray) -> int:
def ndarray_to_file(
    np_array: np.ndarray,
    path: str,
    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
    block_path_provider: Union[FilenameProvider, Callable],
    content_type: str = ContentType.PARQUET.value,
    **kwargs,
) -> None:
    """
    Write a NumPy ndarray to a file by way of a pandas DataFrame.

    The array is wrapped in a DataFrame and handed to
    ``pd_utils.dataframe_to_file``. An optional ``schema`` keyword argument is
    removed from ``kwargs`` (so it is not forwarded to the writer) and, when it
    is a non-empty ``pyarrow.Schema``, is used to label the columns:

    * 1-D array: one column named after the first schema field.
    * 2-D array: the schema field names when their count equals the number of
      array columns, otherwise the stringified positions "0", "1", ...
    * any other dimensionality: ``ValueError``.

    Without a usable schema the DataFrame is built directly from the array and
    keeps the default pandas column labels.

    Args:
        np_array: The array to write.
        path: Destination path, forwarded to dataframe_to_file.
        filesystem: Filesystem, forwarded to dataframe_to_file.
        block_path_provider: Forwarded to dataframe_to_file.
        content_type: Output content type (default: Parquet).
        **kwargs: Remaining keyword arguments, forwarded to dataframe_to_file.

    Raises:
        ValueError: If a pyarrow schema is given and the array is neither 1-D
            nor 2-D.
    """
    import pyarrow as pa

    # "schema" is consumed here; everything else in kwargs goes to the writer.
    arrow_schema = kwargs.pop("schema", None)

    if not arrow_schema or not isinstance(arrow_schema, pa.Schema):
        # No usable schema: let pandas assign its default column labels.
        frame = pd.DataFrame(np_array)
    elif np_array.ndim == 1:
        field_names = arrow_schema.names
        label = field_names[0] if field_names else "0"
        frame = pd.DataFrame({label: np_array})
    elif np_array.ndim == 2:
        width = np_array.shape[1]
        labels = arrow_schema.names
        if len(labels) != width:
            # Schema does not line up with the array; use positional names.
            labels = [str(position) for position in range(width)]
        frame = pd.DataFrame(np_array, columns=labels)
    else:
        raise ValueError(
            f"NumPy arrays with {np_array.ndim} dimensions are not supported"
        )

    pd_utils.dataframe_to_file(
        frame,
        path,
        filesystem,
        block_path_provider,
        content_type,
        **kwargs,
    )
139
+
140
+
def concat_ndarrays(arrays: Optional[List[np.ndarray]]) -> Optional[np.ndarray]:
    """
    Concatenate a list of NumPy ndarrays into a single ndarray.

    Args:
        arrays: NumPy ndarrays to concatenate along axis 0. A list is the
            expected input; other iterables (e.g. a generator) are also
            accepted and are materialized once.

    Returns:
        None if the input is None or empty; the sole array itself (not a
        copy) if there is exactly one; otherwise a new ndarray holding the
        concatenation of all inputs along axis 0.
    """
    if arrays is None:
        return None
    # len() is undefined for generators/iterators, so turn anything that is
    # not already a list or tuple into a list before inspecting its size.
    if not isinstance(arrays, (list, tuple)):
        arrays = list(arrays)
    if not arrays:
        return None
    if len(arrays) == 1:
        return arrays[0]
    return np.concatenate(arrays, axis=0)
156
+
157
+
def append_column_to_ndarray(
    np_array: np.ndarray,
    column_name: str,
    column_value: Any,
) -> np.ndarray:
    """
    Append a constant-valued column to a NumPy ndarray.

    Args:
        np_array: A 2-D array (rows x columns), or a 1-D array, which is
            treated as a single column with one row per element.
        column_name: Not used by this function; the returned ndarray carries
            no column labels.
        column_value: The value repeated in every row of the new column.

    Returns:
        A new 2-D ndarray with one more column than the input, where the last
        column is filled with ``column_value``.
    """
    # A 1-D array has no axis 1 to concatenate along, so view it as an
    # (n, 1) column first.
    table = np_array.reshape(-1, 1) if np_array.ndim == 1 else np_array
    # One-column block with column_value repeated for each row.
    new_column = np.full((len(table), 1), column_value)
    return np.concatenate((table, new_column), axis=1)