deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions, as released to their respective public registries. It is provided for informational purposes only.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
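Most of the churn above is structural: the AWS/Redshift-specific modules and the `local_deltacat_storage` test shims are removed, `deltacat/tests/io` moves to `deltacat/tests/_io`, and large new subsystems land under `deltacat.catalog.main`, `deltacat.storage.main`, `deltacat.compute.converter`, and `deltacat.experimental`. A module-layout sketch inferred solely from the paths listed above (only module paths are used, since this diff does not show exported symbol names):

import deltacat.api                                    # new top-level API module
import deltacat.catalog.main.impl                      # new default catalog implementation
import deltacat.storage.main.impl                      # new main storage implementation
import deltacat.storage.model.transaction              # new transaction model
import deltacat.compute.converter.converter_session    # new Iceberg converter sessions
import deltacat.experimental.storage.rivulet.dataset   # rivulet, added under experimental
import deltacat.experimental.catalog.iceberg.impl      # iceberg catalog, added under experimental

The remainder of this page shows the diff for one of the new files, `deltacat/tests/utils/test_numpy.py` (+1193 -0, entry 312 above).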
deltacat/tests/utils/test_numpy.py
@@ -0,0 +1,1193 @@
+ from unittest import TestCase
+ import numpy as np
+ import tempfile
+ import fsspec
+ import gzip
+ import polars as pl
+ from deltacat.types.media import ContentType, ContentEncoding
+ from deltacat.utils.numpy import (
+     file_to_ndarray,
+     slice_ndarray,
+     ndarray_size,
+     ndarray_to_file,
+ )
+ from deltacat.utils.pandas import ReadKwargsProviderPandasCsvPureUtf8
+
+
+ class TestNumpyReaders(TestCase):
+     def setUp(self):
+         # Create test data files for reading
+         self.fs = fsspec.filesystem("file")
+         self.base_path = tempfile.mkdtemp()
+         self.fs.makedirs(self.base_path, exist_ok=True)
+
+         # Create test data as 2D array (3 rows, 3 columns)
+         self.expected_data = np.array(
+             [["a,b\tc|d", "1", "1.1"], ["e,f\tg|h", "2", "2.2"], ["test", "3", "3.3"]]
+         )
+
+         # Write test files in different formats
+         self._create_test_files()
+
+     def tearDown(self):
+         self.fs.rm(self.base_path, recursive=True)
+
+     def _create_test_files(self):
+         # Create CSV file (GZIP compressed)
+         csv_path = f"{self.base_path}/test.csv"
+         with self.fs.open(csv_path, "wb") as f:
+             with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                 content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+                 gz.write(content.encode("utf-8"))
+
+         # Create TSV file (GZIP compressed)
+         tsv_path = f"{self.base_path}/test.tsv"
+         with self.fs.open(tsv_path, "wb") as f:
+             with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                 content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
+                 gz.write(content.encode("utf-8"))
+
+         # Create PSV file (GZIP compressed)
+         psv_path = f"{self.base_path}/test.psv"
+         with self.fs.open(psv_path, "wb") as f:
+             with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                 content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
+                 gz.write(content.encode("utf-8"))
+
+         # Create unescaped TSV file (GZIP compressed)
+         unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+         with self.fs.open(unescaped_tsv_path, "wb") as f:
+             with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                 content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
+                 gz.write(content.encode("utf-8"))
+
+         # Create Parquet file
+         parquet_path = f"{self.base_path}/test.parquet"
+         import pandas as pd
+
+         df = pd.DataFrame(
+             {
+                 "col1": ["a,b\tc|d", "e,f\tg|h", "test"],
+                 "col2": [1, 2, 3],
+                 "col3": [1.1, 2.2, 3.3],
+             }
+         )
+         df.to_parquet(parquet_path, index=False)
+
+         # Create Feather file
+         feather_path = f"{self.base_path}/test.feather"
+         df.to_feather(feather_path)
+
+         # Create JSON file (GZIP compressed, NDJSON format)
+         json_path = f"{self.base_path}/test.json"
+         with self.fs.open(json_path, "wb") as f:
+             with gzip.GzipFile(fileobj=f, mode="wb") as gz:
+                 json_str = df.to_json(orient="records", lines=True)
+                 gz.write(json_str.encode("utf-8"))
+
+         # Create Avro file using polars
+         avro_path = f"{self.base_path}/test.avro"
+         pl_df = pl.from_pandas(df)
+         pl_df.write_avro(avro_path)
+
+         # Create ORC file
+         orc_path = f"{self.base_path}/test.orc"
+         df.to_orc(orc_path, index=False)
+
+     def test_file_to_ndarray_csv(self):
+         # Test reading CSV with file_to_ndarray
+         csv_path = f"{self.base_path}/test.csv"
+
+         result = file_to_ndarray(
+             csv_path,
+             ContentType.CSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_tsv(self):
+         # Test reading TSV with file_to_ndarray
+         tsv_path = f"{self.base_path}/test.tsv"
+
+         result = file_to_ndarray(
+             tsv_path,
+             ContentType.TSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_psv(self):
+         # Test reading PSV with file_to_ndarray
+         psv_path = f"{self.base_path}/test.psv"
+
+         result = file_to_ndarray(
+             psv_path,
+             ContentType.PSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_unescaped_tsv(self):
+         # Test reading unescaped TSV with file_to_ndarray
+         unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
+
+         result = file_to_ndarray(
+             unescaped_tsv_path,
+             ContentType.UNESCAPED_TSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "abc"
+         assert result[1, 0] == "def"
+         assert result[2, 0] == "ghi"
+
+     def test_file_to_ndarray_parquet(self):
+         # Test reading Parquet with file_to_ndarray
+         parquet_path = f"{self.base_path}/test.parquet"
+
+         result = file_to_ndarray(
+             parquet_path, ContentType.PARQUET.value, filesystem=self.fs
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_feather(self):
+         # Test reading Feather with file_to_ndarray
+         feather_path = f"{self.base_path}/test.feather"
+
+         result = file_to_ndarray(
+             feather_path, ContentType.FEATHER.value, filesystem=self.fs
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_json(self):
+         # Test reading JSON with file_to_ndarray
+         json_path = f"{self.base_path}/test.json"
+
+         result = file_to_ndarray(
+             json_path,
+             ContentType.JSON.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+         )
+
+         assert result.shape == (3, 3)
+         # Note: JSON column order might differ, so check by value presence
+         assert "a,b\tc|d" in result.flatten()
+         assert "e,f\tg|h" in result.flatten()
+         assert "test" in result.flatten()
+
+     def test_file_to_ndarray_avro(self):
+         # Test reading Avro with file_to_ndarray
+         avro_path = f"{self.base_path}/test.avro"
+
+         result = file_to_ndarray(avro_path, ContentType.AVRO.value, filesystem=self.fs)
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_orc(self):
+         # Test reading ORC with file_to_ndarray
+         orc_path = f"{self.base_path}/test.orc"
+
+         result = file_to_ndarray(orc_path, ContentType.ORC.value, filesystem=self.fs)
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_with_column_selection(self):
+         # Test reading with column selection
+         csv_path = f"{self.base_path}/test.csv"
+
+         result = file_to_ndarray(
+             csv_path,
+             ContentType.CSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+             include_columns=["col1", "col2"],
+         )
+
+         assert result.shape == (3, 2)  # Should only have 2 columns
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_file_to_ndarray_with_kwargs_provider(self):
+         # Test reading with kwargs provider (forces string types)
+         csv_path = f"{self.base_path}/test.csv"
+         provider = ReadKwargsProviderPandasCsvPureUtf8(
+             include_columns=["col1", "col2", "col3"]
+         )
+
+         result = file_to_ndarray(
+             csv_path,
+             ContentType.CSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+             pd_read_func_kwargs_provider=provider,
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         # With string types provider, numbers should also be strings
+         assert result[0, 1] == "1"
+         assert result[0, 2] == "1.1"
+
+     def test_file_to_ndarray_filesystem_inference(self):
+         # Test filesystem inference when no filesystem is provided
+         json_path = f"{self.base_path}/test.json"
+
+         result = file_to_ndarray(
+             json_path,
+             ContentType.JSON.value,
+             ContentEncoding.GZIP.value
+             # No filesystem provided - should be inferred
+         )
+
+         assert result.shape == (3, 3)
+         # JSON might have different column ordering
+         assert "a,b\tc|d" in result.flatten()
+         assert "e,f\tg|h" in result.flatten()
+         assert "test" in result.flatten()
+
+     def test_file_to_ndarray_bzip2_compression(self):
+         # Test BZIP2 compression handling
+         import bz2
+
+         # Create a BZIP2 compressed CSV file
+         csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
+         compressed_content = bz2.compress(csv_content.encode("utf-8"))
+
+         bz2_path = f"{self.base_path}/test.csv.bz2"
+         with self.fs.open(bz2_path, "wb") as f:
+             f.write(compressed_content)
+
+         result = file_to_ndarray(
+             bz2_path,
+             ContentType.CSV.value,
+             ContentEncoding.BZIP2.value,
+             filesystem=self.fs,
+             column_names=["col1", "col2", "col3"],
+         )
+
+         assert result.shape == (3, 3)
+         assert result[0, 0] == "a,b\tc|d"
+         assert result[1, 0] == "e,f\tg|h"
+         assert result[2, 0] == "test"
+
+     def test_slice_ndarray(self):
+         # Test slicing functionality
+         arr = np.arange(10).reshape(10, 1)
+
+         # Test without max_len (should return original array)
+         result = slice_ndarray(arr, None)
+         assert len(result) == 1
+         np.testing.assert_array_equal(result[0], arr)
+
+         # Test with max_len
+         result = slice_ndarray(arr, 3)
+         assert len(result) == 4  # 10 rows / 3 = 3 full slices + 1 remainder
+         assert result[0].shape == (3, 1)
+         assert result[1].shape == (3, 1)
+         assert result[2].shape == (3, 1)
+         assert result[3].shape == (1, 1)  # remainder
+
+         # Verify data integrity
+         np.testing.assert_array_equal(result[0], arr[:3])
+         np.testing.assert_array_equal(result[1], arr[3:6])
+         np.testing.assert_array_equal(result[2], arr[6:9])
+         np.testing.assert_array_equal(result[3], arr[9:])
+
+     def test_ndarray_size(self):
+         # Test size calculation
+         arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
+         size = ndarray_size(arr)
+         expected_size = arr.nbytes
+         assert size == expected_size
+
+     def test_ndarray_to_file(self):
+         # Test writing ndarray to file
+         arr = np.array([1, 2, 3, 4, 5])
+         path = f"{self.base_path}/test_output.parquet"
+
+         ndarray_to_file(
+             arr, path, self.fs, lambda x: path, content_type=ContentType.PARQUET.value
+         )
+
+         assert self.fs.exists(path), "file was not written"
+
+         # Verify we can read it back (though this tests the write functionality)
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 5
+         assert "0" in result_df.columns
+
+     def test_ndarray_to_file_different_content_types(self):
+         # Test writing ndarray to different file formats
+         arr = np.array([1, 2, 3, 4, 5])
+
+         # Test Parquet
+         parquet_path = f"{self.base_path}/test_output.parquet"
+         ndarray_to_file(
+             arr,
+             parquet_path,
+             self.fs,
+             lambda x: parquet_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         assert self.fs.exists(parquet_path)
+
+         # Test Feather
+         feather_path = f"{self.base_path}/test_output.feather"
+         ndarray_to_file(
+             arr,
+             feather_path,
+             self.fs,
+             lambda x: feather_path,
+             content_type=ContentType.FEATHER.value,
+         )
+         assert self.fs.exists(feather_path)
+
+         # Test CSV (compressed)
+         csv_path = f"{self.base_path}/test_output.csv"
+         ndarray_to_file(
+             arr,
+             csv_path,
+             self.fs,
+             lambda x: csv_path,
+             content_type=ContentType.CSV.value,
+         )
+         assert self.fs.exists(csv_path)
+
+         # Test JSON (compressed)
+         json_path = f"{self.base_path}/test_output.json"
+         ndarray_to_file(
+             arr,
+             json_path,
+             self.fs,
+             lambda x: json_path,
+             content_type=ContentType.JSON.value,
+         )
+         assert self.fs.exists(json_path)
+
+     def test_ndarray_to_file_different_dtypes(self):
+         # Test writing arrays with different data types
+
+         # Integer array
+         int_arr = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+         int_path = f"{self.base_path}/test_int.parquet"
+         ndarray_to_file(
+             int_arr,
+             int_path,
+             self.fs,
+             lambda x: int_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         assert self.fs.exists(int_path)
+
+         # Float array
+         float_arr = np.array([1.1, 2.2, 3.3, 4.4, 5.5], dtype=np.float64)
+         float_path = f"{self.base_path}/test_float.parquet"
+         ndarray_to_file(
+             float_arr,
+             float_path,
+             self.fs,
+             lambda x: float_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         assert self.fs.exists(float_path)
+
+         # String array (object dtype)
+         str_arr = np.array(["a", "b", "c", "d", "e"], dtype=object)
+         str_path = f"{self.base_path}/test_str.parquet"
+         ndarray_to_file(
+             str_arr,
+             str_path,
+             self.fs,
+             lambda x: str_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         assert self.fs.exists(str_path)
+
+     def test_ndarray_to_file_2d_array(self):
+         # Test writing 2D arrays
+         arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+         path = f"{self.base_path}/test_2d.parquet"
+
+         ndarray_to_file(
+             arr_2d,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+         )
+
+         assert self.fs.exists(path)
+
+         # Verify the file structure
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 3  # 3 rows (first dimension)
+         # 2D array should have columns "0", "1", "2"
+         assert list(result_df.columns) == ["0", "1", "2"]
+         # Verify the data values are correct (convert back to numpy for comparison)
+         result_array = result_df.to_numpy()
+         np.testing.assert_array_equal(result_array, arr_2d)
+
+     def test_ndarray_to_file_empty_array(self):
+         # Test writing empty arrays
+         empty_arr = np.array([])
+         path = f"{self.base_path}/test_empty.parquet"
+
+         ndarray_to_file(
+             empty_arr,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+         )
+
+         assert self.fs.exists(path)
+
+         # Verify the file structure
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 0  # Empty DataFrame
+         assert "0" in result_df.columns
+
+     def test_ndarray_to_file_large_array(self):
+         # Test writing larger arrays
+         large_arr = np.arange(1000)
+         path = f"{self.base_path}/test_large.parquet"
+
+         ndarray_to_file(
+             large_arr,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+         )
+
+         assert self.fs.exists(path)
+
+         # Verify the file can be read and has correct size
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 1000
+         assert "0" in result_df.columns
+
+     def test_ndarray_to_file_with_custom_kwargs(self):
+         # Test writing with custom kwargs
+         arr = np.array([1, 2, 3, 4, 5])
+         path = f"{self.base_path}/test_kwargs.parquet"
+
+         # Add some custom write kwargs (these will be passed to PyArrow)
+         ndarray_to_file(
+             arr,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+             compression="snappy",  # Custom compression
+         )
+
+         assert self.fs.exists(path)
+
+         # Verify the file was written (basic check)
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 5
+         assert "0" in result_df.columns
+
+     def test_ndarray_to_file_readback_verification(self):
+         # Test that we can read back the exact data we wrote
+         original_arr = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
+         path = f"{self.base_path}/test_readback.parquet"
+
+         # Write the array
+         ndarray_to_file(
+             original_arr,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+         )
+
+         # Read it back using pandas and verify content
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         readback_arr = np.array(result_df["0"].tolist())
+
+         # Check that the data matches
+         np.testing.assert_array_almost_equal(original_arr, readback_arr)
+
+     def test_ndarray_to_file_different_filesystems(self):
+         # Test with different filesystem implementations
+         arr = np.array([1, 2, 3, 4, 5])
+
+         # Test with fsspec filesystem (already used in other tests)
+         fsspec_path = f"{self.base_path}/test_fsspec.parquet"
+         ndarray_to_file(
+             arr,
+             fsspec_path,
+             self.fs,
+             lambda x: fsspec_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         assert self.fs.exists(fsspec_path)
+
+         # Test with None filesystem (should infer local filesystem)
+         local_path = f"{self.base_path}/test_local.parquet"
+         ndarray_to_file(
+             arr,
+             local_path,
+             None,  # No filesystem specified
+             lambda x: local_path,
+             content_type=ContentType.PARQUET.value,
+         )
+         # Check if file exists using the fsspec filesystem
+         assert self.fs.exists(local_path)
+
+     def test_ndarray_to_file_boolean_array(self):
+         # Test writing boolean arrays
+         bool_arr = np.array([True, False, True, False, True])
+         path = f"{self.base_path}/test_bool.parquet"
+
+         ndarray_to_file(
+             bool_arr,
+             path,
+             self.fs,
+             lambda x: path,
+             content_type=ContentType.PARQUET.value,
+         )
+
+         assert self.fs.exists(path)
+
+         # Verify the file structure and content
+         import pandas as pd
+
+         result_df = pd.read_parquet(path)
+         assert len(result_df) == 5
+         assert "0" in result_df.columns
+
+         # Check that boolean values are preserved
+         readback_arr = np.array(result_df["0"].tolist())
+         np.testing.assert_array_equal(bool_arr, readback_arr)
+
+     def test_ndarray_to_file_complex_dtypes(self):
+         # Test writing arrays with complex dtypes
+         complex_arr = np.array([1 + 2j, 3 + 4j, 5 + 6j])
+         path = f"{self.base_path}/test_complex.parquet"
+
+         # Note: Complex numbers might not be directly supported by all formats
+         # This test may need to handle conversion or errors gracefully
+         try:
+             ndarray_to_file(
+                 complex_arr,
+                 path,
+                 self.fs,
+                 lambda x: path,
+                 content_type=ContentType.PARQUET.value,
+             )
+             assert self.fs.exists(path)
+         except (TypeError, ValueError, NotImplementedError):
+             # Complex dtypes might not be supported by PyArrow/Parquet
+             # This is acceptable behavior
+             pass
+
+
+ class TestNumpyFileSystemSupport(TestCase):
+     """
+     Comprehensive tests for numpy file operations with different filesystem types.
+     Tests fsspec AbstractFileSystem, PyArrow FileSystem, and auto-inferred filesystem.
+     """
+
+     def setUp(self):
+         import pyarrow.fs as pafs
+
+         # Create test data as numpy array
+         # All formats preserve mixed types when converted to numpy, so use object dtype for all
+         self.test_data = np.array(
+             [["value1", 1, 1.1], ["value2", 2, 2.2], ["value3", 3, 3.3]], dtype=object
+         )
+
+         # Set up temporary directory
+         self.temp_dir = tempfile.mkdtemp()
+
+         # Set up different filesystem types
+         self.fsspec_fs = fsspec.filesystem("file")
+         self.pyarrow_fs = pafs.LocalFileSystem()
+
+         # Create test files for each content type
+         self._create_test_files()
+
+     def tearDown(self):
+         import shutil
+
+         shutil.rmtree(self.temp_dir)
+
+     def _create_test_files(self):
+         """Create test files in different formats with different compression types."""
+         import gzip
+         import bz2
+         import pandas as pd
+
+         # Create pandas DataFrame for file creation
+         df = pd.DataFrame(
+             {
+                 "col1": ["value1", "value2", "value3"],
+                 "col2": [1, 2, 3],
+                 "col3": [1.1, 2.2, 3.3],
+             }
+         )
+
+         # CSV files without headers to match test data structure
+         csv_data = "value1,1,1.1\nvalue2,2,2.2\nvalue3,3,3.3\n"
+
+         # Create uncompressed CSV
+         with open(f"{self.temp_dir}/test.csv", "w") as f:
+             f.write(csv_data)
+
+         # Create GZIP compressed CSV
+         with gzip.open(f"{self.temp_dir}/test_gzip.csv.gz", "wt") as f:
+             f.write(csv_data)
+
+         # Create BZIP2 compressed CSV
+         with bz2.open(f"{self.temp_dir}/test_bzip2.csv.bz2", "wt") as f:
+             f.write(csv_data)
+
+         # Parquet file
+         df.to_parquet(f"{self.temp_dir}/test.parquet", index=False)
+
+         # Feather file
+         df.to_feather(f"{self.temp_dir}/test.feather")
+
+         # JSON file (NDJSON format)
+         json_str = df.to_json(orient="records", lines=True)
+         with open(f"{self.temp_dir}/test.json", "w") as f:
+             f.write(json_str)
+
+         # AVRO file (using polars since pandas delegates to polars for AVRO)
+         import polars as pl
+
+         pl_df = pl.from_pandas(df)
+         pl_df.write_avro(f"{self.temp_dir}/test.avro")
+
+         # ORC file
+         df.to_orc(f"{self.temp_dir}/test.orc")
+
+     def _assert_arrays_equal(self, result, expected):
+         """Helper to assert numpy arrays are equal."""
+         assert (
+             result.shape == expected.shape
+         ), f"Shape mismatch: {result.shape} vs {expected.shape}"
+         np.testing.assert_array_equal(result, expected)
+
+     def test_csv_with_fsspec_filesystem(self):
+         """Test CSV reading with fsspec AbstractFileSystem."""
+         # Test uncompressed CSV
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.csv",
+             ContentType.CSV.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test GZIP compressed CSV
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test_gzip.csv.gz",
+             ContentType.CSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.fsspec_fs,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test BZIP2 compressed CSV
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test_bzip2.csv.bz2",
+             ContentType.CSV.value,
+             ContentEncoding.BZIP2.value,
+             filesystem=self.fsspec_fs,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_csv_with_pyarrow_filesystem(self):
+         """Test CSV reading with PyArrow FileSystem."""
+         # Test uncompressed CSV
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.csv",
+             ContentType.CSV.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test GZIP compressed CSV
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test_gzip.csv.gz",
+             ContentType.CSV.value,
+             ContentEncoding.GZIP.value,
+             filesystem=self.pyarrow_fs,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_csv_with_auto_inferred_filesystem(self):
+         """Test CSV reading with automatically inferred filesystem."""
+         # Test uncompressed CSV (filesystem=None, should auto-infer)
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.csv",
+             ContentType.CSV.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_parquet_with_different_filesystems(self):
+         """Test Parquet reading with different filesystem types."""
+         # Test with fsspec
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.parquet",
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with PyArrow
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.parquet",
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with auto-inferred
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.parquet",
+             ContentType.PARQUET.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_feather_with_different_filesystems(self):
+         """Test Feather reading with different filesystem types."""
+         # Test with fsspec
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.feather",
+             ContentType.FEATHER.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with PyArrow
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.feather",
+             ContentType.FEATHER.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with auto-inferred
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.feather",
+             ContentType.FEATHER.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_json_with_different_filesystems(self):
+         """Test JSON reading with different filesystem types."""
+         # Test with fsspec
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.json",
+             ContentType.JSON.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with PyArrow
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.json",
+             ContentType.JSON.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with auto-inferred
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.json",
+             ContentType.JSON.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_avro_with_different_filesystems(self):
+         """Test AVRO reading with different filesystem types."""
+         # Test with fsspec
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.avro",
+             ContentType.AVRO.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with PyArrow
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.avro",
+             ContentType.AVRO.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with auto-inferred
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.avro",
+             ContentType.AVRO.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_orc_with_different_filesystems(self):
+         """Test ORC reading with different filesystem types."""
+         # Test with fsspec
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.orc",
+             ContentType.ORC.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with PyArrow
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.orc",
+             ContentType.ORC.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.pyarrow_fs,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+         # Test with auto-inferred
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.orc",
+             ContentType.ORC.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=None,
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_file_to_ndarray_with_different_filesystems(self):
+         """Test file_to_ndarray with different filesystem types for all content types."""
+         test_cases = [
+             (
+                 f"{self.temp_dir}/test.csv",
+                 ContentType.CSV.value,
+                 ContentEncoding.IDENTITY.value,
+                 {"column_names": ["col1", "col2", "col3"]},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test_gzip.csv.gz",
+                 ContentType.CSV.value,
+                 ContentEncoding.GZIP.value,
+                 {"column_names": ["col1", "col2", "col3"]},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test.parquet",
+                 ContentType.PARQUET.value,
+                 ContentEncoding.IDENTITY.value,
+                 {},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test.feather",
+                 ContentType.FEATHER.value,
+                 ContentEncoding.IDENTITY.value,
+                 {},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test.json",
+                 ContentType.JSON.value,
+                 ContentEncoding.IDENTITY.value,
+                 {},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test.avro",
+                 ContentType.AVRO.value,
+                 ContentEncoding.IDENTITY.value,
+                 {},
+                 self.test_data,
+             ),
+             (
+                 f"{self.temp_dir}/test.orc",
+                 ContentType.ORC.value,
+                 ContentEncoding.IDENTITY.value,
+                 {},
+                 self.test_data,
+             ),
+         ]
+
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         for (
+             path,
+             content_type,
+             content_encoding,
+             extra_kwargs,
+             expected_data,
+         ) in test_cases:
+             for fs_name, filesystem in filesystems:
+                 with self.subTest(
+                     content_type=content_type,
+                     filesystem=fs_name,
+                     encoding=content_encoding,
+                 ):
+                     result = file_to_ndarray(
+                         path=path,
+                         content_type=content_type,
+                         content_encoding=content_encoding,
+                         filesystem=filesystem,
+                         **extra_kwargs,
+                     )
+                     self._assert_arrays_equal(result, expected_data)
+
+     def test_compression_encoding_with_different_filesystems(self):
+         """Test that compression encoding works correctly with different filesystem types."""
+         test_cases = [
+             (f"{self.temp_dir}/test.csv", ContentEncoding.IDENTITY.value),
+             (f"{self.temp_dir}/test_gzip.csv.gz", ContentEncoding.GZIP.value),
+             (f"{self.temp_dir}/test_bzip2.csv.bz2", ContentEncoding.BZIP2.value),
+         ]
+
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         for path, content_encoding in test_cases:
+             for fs_name, filesystem in filesystems:
+                 with self.subTest(encoding=content_encoding, filesystem=fs_name):
+                     result = file_to_ndarray(
+                         path=path,
+                         content_type=ContentType.CSV.value,
+                         content_encoding=content_encoding,
+                         filesystem=filesystem,
+                         column_names=["col1", "col2", "col3"],
+                     )
+                     self._assert_arrays_equal(result, self.test_data)
+
+     def test_filesystem_open_kwargs(self):
+         """Test that filesystem open kwargs are properly passed through."""
+         # Test with custom fs_open_kwargs
+         result = file_to_ndarray(
+             f"{self.temp_dir}/test.csv",
+             ContentType.CSV.value,
+             ContentEncoding.IDENTITY.value,
+             filesystem=self.fsspec_fs,
+             fs_open_kwargs={
+                 "encoding": "utf-8"
+             },  # This should be passed to filesystem.open()
+             column_names=["col1", "col2", "col3"],
+         )
+         self._assert_arrays_equal(result, self.test_data)
+
+     def test_delimited_formats_with_different_filesystems(self):
+         """Test delimited formats (TSV, PSV, etc.) with different filesystem types."""
+         # Create TSV test file without headers to match test data structure
+         tsv_data = "value1\t1\t1.1\nvalue2\t2\t2.2\nvalue3\t3\t3.3\n"
+         with open(f"{self.temp_dir}/test.tsv", "w") as f:
+             f.write(tsv_data)
+
+         # Create PSV test file without headers to match test data structure
+         psv_data = "value1|1|1.1\nvalue2|2|2.2\nvalue3|3|3.3\n"
+         with open(f"{self.temp_dir}/test.psv", "w") as f:
+             f.write(psv_data)
+
+         delimited_test_cases = [
+             (
+                 f"{self.temp_dir}/test.tsv",
+                 ContentType.TSV.value,
+                 {"sep": "\t", "column_names": ["col1", "col2", "col3"]},
+             ),
+             (
+                 f"{self.temp_dir}/test.psv",
+                 ContentType.PSV.value,
+                 {"sep": "|", "column_names": ["col1", "col2", "col3"]},
+             ),
+         ]
+
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         for path, content_type, extra_kwargs in delimited_test_cases:
+             for fs_name, filesystem in filesystems:
+                 with self.subTest(content_type=content_type, filesystem=fs_name):
+                     result = file_to_ndarray(
+                         path=path,
+                         content_type=content_type,
+                         content_encoding=ContentEncoding.IDENTITY.value,
+                         filesystem=filesystem,
+                         **extra_kwargs,
+                     )
+                     self._assert_arrays_equal(result, self.test_data)
+
+     def test_numpy_array_conversion_consistency(self):
+         """Test that numpy array conversion is consistent across filesystem types."""
+         # Test that the same data produces the same numpy array regardless of filesystem
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         # Use Parquet as it preserves data types well
+         parquet_path = f"{self.temp_dir}/test.parquet"
+
+         results = []
+         for fs_name, filesystem in filesystems:
+             result = file_to_ndarray(
+                 parquet_path,
+                 ContentType.PARQUET.value,
+                 ContentEncoding.IDENTITY.value,
+                 filesystem=filesystem,
+             )
+             results.append((fs_name, result))
+
+         # All results should be identical
+         reference_result = results[0][1]
+         for fs_name, result in results[1:]:
+             with self.subTest(filesystem=fs_name):
+                 self._assert_arrays_equal(result, reference_result)
+
+     def test_dtype_preservation_across_filesystems(self):
+         """Test that data types are preserved across different filesystem types."""
+         import pandas as pd
+
+         # Create a DataFrame with mixed data types
+         df = pd.DataFrame(
+             {
+                 "int_col": [1, 2, 3],
+                 "float_col": [1.1, 2.2, 3.3],
+                 "str_col": ["a", "b", "c"],
+             }
+         )
+
+         # Save as Parquet (preserves types best)
+         parquet_path = f"{self.temp_dir}/test_dtypes.parquet"
+         df.to_parquet(parquet_path, index=False)
+
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         # Test that data types are consistent across filesystems
+         dtypes = []
+         for fs_name, filesystem in filesystems:
+             result = file_to_ndarray(
+                 parquet_path,
+                 ContentType.PARQUET.value,
+                 ContentEncoding.IDENTITY.value,
+                 filesystem=filesystem,
+             )
+             dtypes.append((fs_name, result.dtype))
+
+         # All dtypes should be the same (object type for mixed data)
+         reference_dtype = dtypes[0][1]
+         for fs_name, dtype in dtypes[1:]:
+             with self.subTest(filesystem=fs_name):
+                 assert (
+                     dtype == reference_dtype
+                 ), f"Dtype mismatch for {fs_name}: {dtype} vs {reference_dtype}"
+
+     def test_error_handling_across_filesystems(self):
+         """Test that error handling is consistent across filesystem types."""
+         filesystems = [
+             ("fsspec", self.fsspec_fs),
+             ("pyarrow", self.pyarrow_fs),
+             ("auto-inferred", None),
+         ]
+
+         # Test with non-existent file
+         for fs_name, filesystem in filesystems:
+             with self.subTest(filesystem=fs_name):
+                 with self.assertRaises(
+                     Exception
+                 ):  # Should raise some kind of file not found error
+                     file_to_ndarray(
+                         f"{self.temp_dir}/nonexistent.csv",
+                         ContentType.CSV.value,
+                         ContentEncoding.IDENTITY.value,
+                         filesystem=filesystem,
+                         column_names=["col1", "col2", "col3"],
+                     )
+ )