deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,33 @@
1
1
  from unittest import TestCase
2
2
  from deltacat.utils.pyarrow import (
3
- s3_partial_parquet_file_to_table,
3
+ partial_parquet_file_to_table,
4
4
  pyarrow_read_csv,
5
5
  ContentTypeValidationError,
6
6
  content_type_to_reader_kwargs,
7
7
  _add_column_kwargs,
8
- logger,
9
- s3_file_to_table,
10
- s3_file_to_parquet,
8
+ file_to_table,
9
+ file_to_parquet,
10
+ table_to_file,
11
11
  ReadKwargsProviderPyArrowSchemaOverride,
12
- RAISE_ON_EMPTY_CSV_KWARG,
12
+ ReadKwargsProviderPyArrowCsvPureUtf8,
13
13
  RAISE_ON_DECIMAL_OVERFLOW,
14
- OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
14
+ RAISE_ON_EMPTY_CSV_KWARG,
15
15
  )
16
16
  import decimal
17
17
  from deltacat.types.media import ContentEncoding, ContentType
18
18
  from deltacat.types.partial_download import PartialParquetParameters
19
19
  from pyarrow.parquet import ParquetFile
20
+ import tempfile
20
21
  import pyarrow as pa
22
+ from pyarrow import csv as pacsv
23
+ import fsspec
24
+ import gzip
25
+ import json
26
+ from pyarrow import (
27
+ feather as paf,
28
+ parquet as papq,
29
+ orc as paorc,
30
+ )
21
31
 
22
32
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
23
33
  PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
@@ -33,8 +43,8 @@ GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed
33
43
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
34
44
 
35
45
 
36
- class TestS3PartialParquetFileToTable(TestCase):
37
- def test_s3_partial_parquet_file_to_table_sanity(self):
46
+ class TestPartialParquetFileToTable(TestCase):
47
+ def test_partial_parquet_file_to_table_sanity(self):
38
48
 
39
49
  pq_file = ParquetFile(PARQUET_FILE_PATH)
40
50
  partial_parquet_params = PartialParquetParameters.of(
@@ -48,7 +58,7 @@ class TestS3PartialParquetFileToTable(TestCase):
48
58
  # only first row group to be downloaded
49
59
  partial_parquet_params.row_groups_to_download.pop()
50
60
 
51
- result = s3_partial_parquet_file_to_table(
61
+ result = partial_parquet_file_to_table(
52
62
  PARQUET_FILE_PATH,
53
63
  include_columns=["n_legs"],
54
64
  content_encoding=ContentEncoding.IDENTITY.value,
@@ -59,7 +69,7 @@ class TestS3PartialParquetFileToTable(TestCase):
59
69
  self.assertEqual(len(result), 3)
60
70
  self.assertEqual(len(result.columns), 1)
61
71
 
62
- def test_s3_partial_parquet_file_to_table_when_schema_passed(self):
72
+ def test_partial_parquet_file_to_table_when_schema_passed(self):
63
73
 
64
74
  pq_file = ParquetFile(PARQUET_FILE_PATH)
65
75
  partial_parquet_params = PartialParquetParameters.of(
@@ -79,7 +89,7 @@ class TestS3PartialParquetFileToTable(TestCase):
79
89
 
80
90
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
81
91
 
82
- result = s3_partial_parquet_file_to_table(
92
+ result = partial_parquet_file_to_table(
83
93
  PARQUET_FILE_PATH,
84
94
  ContentType.PARQUET.value,
85
95
  ContentEncoding.IDENTITY.value,
@@ -98,7 +108,7 @@ class TestS3PartialParquetFileToTable(TestCase):
98
108
  self.assertEqual(result_schema.field(2).type, "int64")
99
109
  self.assertEqual(result_schema.field(2).name, "MISSING")
100
110
 
101
- def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
111
+ def test_partial_parquet_file_to_table_when_schema_missing_columns(self):
102
112
 
103
113
  pq_file = ParquetFile(PARQUET_FILE_PATH)
104
114
  partial_parquet_params = PartialParquetParameters.of(
@@ -118,7 +128,7 @@ class TestS3PartialParquetFileToTable(TestCase):
118
128
 
119
129
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
120
130
 
121
- result = s3_partial_parquet_file_to_table(
131
+ result = partial_parquet_file_to_table(
122
132
  PARQUET_FILE_PATH,
123
133
  ContentType.PARQUET.value,
124
134
  ContentEncoding.IDENTITY.value,
@@ -135,7 +145,7 @@ class TestS3PartialParquetFileToTable(TestCase):
135
145
  self.assertEqual(result_schema.field(0).type, "int64")
136
146
  self.assertEqual(result_schema.field(0).name, "MISSING")
137
147
 
138
- def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
148
+ def test_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
139
149
  self,
140
150
  ):
141
151
 
@@ -152,11 +162,11 @@ class TestS3PartialParquetFileToTable(TestCase):
152
162
 
153
163
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
154
164
 
155
- result = s3_partial_parquet_file_to_table(
165
+ result = partial_parquet_file_to_table(
156
166
  PARQUET_FILE_PATH,
157
167
  ContentType.PARQUET.value,
158
168
  ContentEncoding.IDENTITY.value,
159
- ["n_legs", "animal"],
169
+ column_names=["n_legs", "animal"],
160
170
  pa_read_func_kwargs_provider=pa_kwargs_provider,
161
171
  partial_file_download_params=partial_parquet_params,
162
172
  )
@@ -168,7 +178,7 @@ class TestS3PartialParquetFileToTable(TestCase):
168
178
  self.assertEqual(result_schema.field(0).type, "string")
169
179
  self.assertEqual(result_schema.field(0).name, "n_legs") # order doesn't change
170
180
 
171
- def test_s3_partial_parquet_file_to_table_when_multiple_row_groups(self):
181
+ def test_partial_parquet_file_to_table_when_multiple_row_groups(self):
172
182
 
173
183
  pq_file = ParquetFile(PARQUET_FILE_PATH)
174
184
  partial_parquet_params = PartialParquetParameters.of(
@@ -179,7 +189,7 @@ class TestS3PartialParquetFileToTable(TestCase):
179
189
  partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
180
190
  )
181
191
 
182
- result = s3_partial_parquet_file_to_table(
192
+ result = partial_parquet_file_to_table(
183
193
  PARQUET_FILE_PATH,
184
194
  content_encoding=ContentEncoding.IDENTITY.value,
185
195
  content_type=ContentType.PARQUET.value,
@@ -668,301 +678,1140 @@ class TestReadCSV(TestCase):
668
678
  self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
669
679
 
670
680
 
671
- class TestS3FileToTable(TestCase):
672
- def test_s3_file_to_table_identity_sanity(self):
681
+ class TestWriters(TestCase):
682
+ def setUp(self):
683
+ self.table = pa.table({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
684
+ self.fs = fsspec.filesystem("file")
685
+ self.base_path = tempfile.mkdtemp()
686
+ self.fs.makedirs(self.base_path, exist_ok=True)
687
+
688
+ def tearDown(self):
689
+ self.fs.rm(self.base_path, recursive=True)
690
+
691
+ def test_write_feather(self):
692
+ path = f"{self.base_path}/test.feather"
693
+
694
+ table_to_file(
695
+ self.table,
696
+ path,
697
+ self.fs,
698
+ lambda x: path,
699
+ content_type=ContentType.FEATHER.value,
700
+ )
701
+ assert self.fs.exists(path), "file was not written"
702
+
703
+ # Verify content
704
+ result = paf.read_table(path)
705
+ assert result.equals(self.table)
706
+
707
+ def test_write_csv(self):
708
+ path = f"{self.base_path}/test.csv.gz"
709
+
710
+ table_to_file(
711
+ self.table,
712
+ path,
713
+ self.fs,
714
+ lambda x: path,
715
+ content_type=ContentType.CSV.value,
716
+ )
717
+ assert self.fs.exists(path), "file was not written"
718
+
719
+ # Verify content (should be GZIP compressed)
720
+ with self.fs.open(path, "rb") as f:
721
+ with gzip.GzipFile(fileobj=f) as gz:
722
+ content = gz.read().decode("utf-8")
723
+ # Should be quoted due to commas in data
724
+ assert '"a,b\tc|d",1' in content
725
+ assert '"e,f\tg|h",2' in content
726
+
727
+ def test_write_tsv(self):
728
+ path = f"{self.base_path}/test.tsv.gz"
729
+
730
+ table_to_file(
731
+ self.table,
732
+ path,
733
+ self.fs,
734
+ lambda x: path,
735
+ content_type=ContentType.TSV.value,
736
+ )
737
+ assert self.fs.exists(path), "file was not written"
738
+
739
+ # Verify content (should be GZIP compressed)
740
+ with self.fs.open(path, "rb") as f:
741
+ with gzip.GzipFile(fileobj=f) as gz:
742
+ content = gz.read().decode("utf-8")
743
+ # Should be quoted due to tabs in data
744
+ assert '"a,b\tc|d"\t1' in content
745
+ assert '"e,f\tg|h"\t2' in content
746
+
747
+ def test_write_psv(self):
748
+ path = f"{self.base_path}/test.psv.gz"
749
+
750
+ table_to_file(
751
+ self.table,
752
+ path,
753
+ self.fs,
754
+ lambda x: path,
755
+ content_type=ContentType.PSV.value,
756
+ )
757
+ assert self.fs.exists(path), "file was not written"
758
+
759
+ # Verify content (should be GZIP compressed)
760
+ with self.fs.open(path, "rb") as f:
761
+ with gzip.GzipFile(fileobj=f) as gz:
762
+ content = gz.read().decode("utf-8")
763
+ # Should be quoted due to pipes in data
764
+ assert '"a,b\tc|d"|1' in content
765
+ assert '"e,f\tg|h"|2' in content
766
+
767
+ def test_write_unescaped_tsv(self):
768
+ # Create table without delimiters for unescaped TSV
769
+ table = pa.table({"col1": ["abc", "def"], "col2": [1, 2]})
770
+ path = f"{self.base_path}/test.tsv.gz"
771
+
772
+ table_to_file(
773
+ table,
774
+ path,
775
+ self.fs,
776
+ lambda x: path,
777
+ content_type=ContentType.UNESCAPED_TSV.value,
778
+ )
779
+ assert self.fs.exists(path), "file was not written"
780
+
781
+ # Verify content (should be GZIP compressed)
782
+ with self.fs.open(path, "rb") as f:
783
+ with gzip.GzipFile(fileobj=f) as gz:
784
+ content = gz.read().decode("utf-8")
785
+ # With quoting_style="none", strings should not be quoted
786
+ assert "abc\t1" in content
787
+ assert "def\t2" in content
788
+
789
+ def test_write_orc(self):
790
+ path = f"{self.base_path}/test.orc"
791
+
792
+ table_to_file(
793
+ self.table,
794
+ path,
795
+ self.fs,
796
+ lambda x: path,
797
+ content_type=ContentType.ORC.value,
798
+ )
799
+ assert self.fs.exists(path), "file was not written"
800
+
801
+ # Verify content
802
+ result = paorc.read_table(path)
803
+ assert result.equals(self.table)
804
+
805
+ def test_write_parquet(self):
806
+ path = f"{self.base_path}/test.parquet"
807
+
808
+ table_to_file(
809
+ self.table,
810
+ path,
811
+ self.fs,
812
+ lambda x: path,
813
+ content_type=ContentType.PARQUET.value,
814
+ )
815
+ assert self.fs.exists(path), "file was not written"
816
+
817
+ # Verify content
818
+ result = papq.read_table(path)
819
+ assert result.equals(self.table)
820
+
821
+ def test_write_json(self):
822
+ path = f"{self.base_path}/test.json.gz"
823
+
824
+ table_to_file(
825
+ self.table,
826
+ path,
827
+ self.fs,
828
+ lambda x: path,
829
+ content_type=ContentType.JSON.value,
830
+ )
831
+ assert self.fs.exists(path), "file was not written"
832
+
833
+ # Verify content (should be GZIP compressed)
834
+ with self.fs.open(path, "rb") as f:
835
+ with gzip.GzipFile(fileobj=f) as gz:
836
+ content = gz.read().decode("utf-8")
837
+ # Each line should be a valid JSON object
838
+ lines = [
839
+ line for line in content.split("\n") if line
840
+ ] # Skip empty lines
841
+ assert len(lines) == 2 # 2 records
842
+ assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
843
+ assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}
844
+
845
+ def test_write_avro(self):
846
+ import polars as pl
847
+
848
+ path = f"{self.base_path}/test.avro"
849
+
850
+ table_to_file(
851
+ self.table,
852
+ path,
853
+ self.fs,
854
+ lambda x: path,
855
+ content_type=ContentType.AVRO.value,
856
+ )
857
+ assert self.fs.exists(path), "file was not written"
858
+
859
+ # Verify content by reading with polars
860
+ result = pl.read_avro(path).to_arrow()
861
+ # Cast the result to match the original table's schema
862
+ # (the round-trip from arrow->polars->arrow casts string to large string)
863
+ result = result.cast(self.table.schema)
864
+ assert result.equals(self.table)
865
+
866
+
867
+ class TestPyArrowReaders(TestCase):
868
+ def setUp(self):
869
+ # Create test data files for reading
870
+ self.fs = fsspec.filesystem("file")
871
+ self.base_path = tempfile.mkdtemp()
872
+ self.fs.makedirs(self.base_path, exist_ok=True)
873
+
874
+ # Create test Table
875
+ self.table = pa.Table.from_pylist(
876
+ [
877
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
878
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
879
+ {"col1": "test", "col2": 3, "col3": 3.3},
880
+ ]
881
+ )
673
882
 
674
- schema = pa.schema(
675
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
883
+ # Write test files in different formats
884
+ self._create_test_files()
885
+
886
+ def tearDown(self):
887
+ self.fs.rm(self.base_path, recursive=True)
888
+
889
+ def _create_test_files(self):
890
+ # Create CSV file (GZIP compressed)
891
+ csv_path = f"{self.base_path}/test.csv"
892
+ with self.fs.open(csv_path, "wb") as f:
893
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
894
+ content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
895
+ gz.write(content.encode("utf-8"))
896
+
897
+ # Create TSV file (GZIP compressed)
898
+ tsv_path = f"{self.base_path}/test.tsv"
899
+ with self.fs.open(tsv_path, "wb") as f:
900
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
901
+ content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
902
+ gz.write(content.encode("utf-8"))
903
+
904
+ # Create PSV file (GZIP compressed)
905
+ psv_path = f"{self.base_path}/test.psv"
906
+ with self.fs.open(psv_path, "wb") as f:
907
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
908
+ content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
909
+ gz.write(content.encode("utf-8"))
910
+
911
+ # Create unescaped TSV file (GZIP compressed)
912
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
913
+ pa.Table.from_pylist(
914
+ [
915
+ {"col1": "abc", "col2": 1, "col3": 1.1},
916
+ {"col1": "def", "col2": 2, "col3": 2.2},
917
+ {"col1": "ghi", "col2": 3, "col3": 3.3},
918
+ ]
676
919
  )
920
+ with self.fs.open(unescaped_tsv_path, "wb") as f:
921
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
922
+ content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
923
+ gz.write(content.encode("utf-8"))
924
+
925
+ # Create Parquet file
926
+ parquet_path = f"{self.base_path}/test.parquet"
927
+ with self.fs.open(parquet_path, "wb") as f:
928
+ papq.write_table(self.table, f)
929
+
930
+ # Create Feather file
931
+ feather_path = f"{self.base_path}/test.feather"
932
+ with self.fs.open(feather_path, "wb") as f:
933
+ paf.write_feather(self.table, f)
934
+
935
+ # Create JSON file (GZIP compressed)
936
+ json_path = f"{self.base_path}/test.json"
937
+ with self.fs.open(json_path, "wb") as f:
938
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
939
+ # Create NDJSON format - one JSON object per line
940
+ lines = []
941
+ for row in self.table.to_pylist():
942
+ lines.append(json.dumps(row))
943
+ content = "\n".join(lines) + "\n"
944
+ gz.write(content.encode("utf-8"))
945
+
946
+ # Create Avro file using polars (since pyarrow delegates to polars for Avro)
947
+ avro_path = f"{self.base_path}/test.avro"
948
+ import polars as pl
949
+
950
+ pl_df = pl.from_arrow(self.table)
951
+ pl_df.write_avro(avro_path)
952
+
953
+ # Create ORC file
954
+ orc_path = f"{self.base_path}/test.orc"
955
+ with self.fs.open(orc_path, "wb") as f:
956
+ paorc.write_table(self.table, f)
957
+
958
+ def test_content_type_to_reader_kwargs(self):
959
+ # Test CSV kwargs
960
+ csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
961
+ expected_csv = {"parse_options": pacsv.ParseOptions(delimiter=",")}
962
+ assert (
963
+ csv_kwargs["parse_options"].delimiter
964
+ == expected_csv["parse_options"].delimiter
965
+ )
966
+
967
+ # Test TSV kwargs
968
+ tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
969
+ expected_tsv = {"parse_options": pacsv.ParseOptions(delimiter="\t")}
970
+ assert (
971
+ tsv_kwargs["parse_options"].delimiter
972
+ == expected_tsv["parse_options"].delimiter
973
+ )
974
+
975
+ # Test PSV kwargs
976
+ psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
977
+ expected_psv = {"parse_options": pacsv.ParseOptions(delimiter="|")}
978
+ assert (
979
+ psv_kwargs["parse_options"].delimiter
980
+ == expected_psv["parse_options"].delimiter
981
+ )
982
+
983
+ # Test unescaped TSV kwargs
984
+ unescaped_kwargs = content_type_to_reader_kwargs(
985
+ ContentType.UNESCAPED_TSV.value
986
+ )
987
+ assert unescaped_kwargs["parse_options"].delimiter == "\t"
988
+ assert unescaped_kwargs["parse_options"].quote_char is False
989
+ assert unescaped_kwargs["convert_options"].null_values == [""]
990
+
991
+ # Test Parquet kwargs (should be empty)
992
+ parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
993
+ assert parquet_kwargs == {}
994
+
995
+ # Test ORC kwargs (should be empty)
996
+ orc_kwargs = content_type_to_reader_kwargs(ContentType.ORC.value)
997
+ assert orc_kwargs == {}
998
+
999
+ # Test Avro kwargs (should be empty)
1000
+ avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
1001
+ assert avro_kwargs == {}
1002
+
1003
+ def test_add_column_kwargs(self):
1004
+ kwargs = {}
1005
+ column_names = ["col1", "col2", "col3"]
1006
+ include_columns = ["col1", "col2"]
1007
+
1008
+ # Test CSV column kwargs
1009
+ _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
1010
+ assert kwargs["read_options"].column_names == column_names
1011
+ assert kwargs["convert_options"].include_columns == include_columns
1012
+
1013
+ # Test Parquet column kwargs
1014
+ kwargs = {}
1015
+ _add_column_kwargs(
1016
+ ContentType.PARQUET.value, column_names, include_columns, kwargs
1017
+ )
1018
+ assert kwargs["columns"] == include_columns
677
1019
 
678
- result = s3_file_to_table(
679
- NON_EMPTY_VALID_UTSV_PATH,
680
- ContentType.UNESCAPED_TSV.value,
681
- ContentEncoding.IDENTITY.value,
682
- ["is_active", "ship_datetime_utc"],
683
- None,
684
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
685
- schema=schema
686
- ),
1020
+ def test_file_to_table_csv(self):
1021
+ # Test reading CSV with file_to_table
1022
+ csv_path = f"{self.base_path}/test.csv"
1023
+
1024
+ result = file_to_table(
1025
+ csv_path,
1026
+ ContentType.CSV.value,
1027
+ ContentEncoding.GZIP.value,
1028
+ filesystem=self.fs,
1029
+ column_names=["col1", "col2", "col3"],
687
1030
  )
688
1031
 
689
- self.assertEqual(len(result), 3)
690
- self.assertEqual(len(result.column_names), 2)
691
- result_schema = result.schema
692
- for index, field in enumerate(result_schema):
693
- self.assertEqual(field.name, schema.field(index).name)
1032
+ assert len(result) == 3
1033
+ assert result.column_names == ["col1", "col2", "col3"]
1034
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
694
1035
 
695
- self.assertEqual(result.schema.field(0).type, "string")
1036
+ def test_file_to_table_tsv(self):
1037
+ # Test reading TSV with file_to_table
1038
+ tsv_path = f"{self.base_path}/test.tsv"
696
1039
 
697
- def test_s3_file_to_table_gzip_compressed_sanity(self):
1040
+ result = file_to_table(
1041
+ tsv_path,
1042
+ ContentType.TSV.value,
1043
+ ContentEncoding.GZIP.value,
1044
+ filesystem=self.fs,
1045
+ column_names=["col1", "col2", "col3"],
1046
+ )
698
1047
 
699
- schema = pa.schema(
700
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
1048
+ assert len(result) == 3
1049
+ assert result.column_names == ["col1", "col2", "col3"]
1050
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1051
+
1052
+ def test_file_to_table_psv(self):
1053
+ # Test reading PSV with file_to_table
1054
+ psv_path = f"{self.base_path}/test.psv"
1055
+
1056
+ result = file_to_table(
1057
+ psv_path,
1058
+ ContentType.PSV.value,
1059
+ ContentEncoding.GZIP.value,
1060
+ filesystem=self.fs,
1061
+ column_names=["col1", "col2", "col3"],
701
1062
  )
702
1063
 
703
- result = s3_file_to_table(
704
- GZIP_COMPRESSED_FILE_UTSV_PATH,
1064
+ assert len(result) == 3
1065
+ assert result.column_names == ["col1", "col2", "col3"]
1066
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1067
+
1068
+ def test_file_to_table_unescaped_tsv(self):
1069
+ # Test reading unescaped TSV with file_to_table
1070
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
1071
+
1072
+ result = file_to_table(
1073
+ unescaped_tsv_path,
705
1074
  ContentType.UNESCAPED_TSV.value,
706
1075
  ContentEncoding.GZIP.value,
707
- ["is_active", "ship_datetime_utc"],
708
- None,
709
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
710
- schema=schema
711
- ),
1076
+ filesystem=self.fs,
1077
+ column_names=["col1", "col2", "col3"],
712
1078
  )
713
1079
 
714
- self.assertEqual(len(result), 3)
715
- self.assertEqual(len(result.column_names), 2)
716
- result_schema = result.schema
717
- for index, field in enumerate(result_schema):
718
- self.assertEqual(field.name, schema.field(index).name)
1080
+ assert len(result) == 3
1081
+ assert result.column_names == ["col1", "col2", "col3"]
1082
+ assert result.column("col1").to_pylist() == ["abc", "def", "ghi"]
719
1083
 
720
- self.assertEqual(result.schema.field(0).type, "string")
1084
+ def test_file_to_table_parquet(self):
1085
+ # Test reading Parquet with file_to_table
1086
+ parquet_path = f"{self.base_path}/test.parquet"
721
1087
 
722
- def test_s3_file_to_table_bz2_compressed_sanity(self):
1088
+ result = file_to_table(
1089
+ parquet_path, ContentType.PARQUET.value, filesystem=self.fs
1090
+ )
723
1091
 
724
- schema = pa.schema(
725
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
1092
+ assert len(result) == 3
1093
+ assert result.column_names == ["col1", "col2", "col3"]
1094
+ assert result.equals(self.table)
1095
+
1096
+ def test_file_to_table_feather(self):
1097
+ # Test reading Feather with file_to_table
1098
+ feather_path = f"{self.base_path}/test.feather"
1099
+
1100
+ result = file_to_table(
1101
+ feather_path, ContentType.FEATHER.value, filesystem=self.fs
726
1102
  )
727
1103
 
728
- result = s3_file_to_table(
729
- BZ2_COMPRESSED_FILE_UTSV_PATH,
730
- ContentType.UNESCAPED_TSV.value,
731
- ContentEncoding.BZIP2.value,
732
- ["is_active", "ship_datetime_utc"],
733
- None,
734
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
735
- schema=schema
736
- ),
1104
+ assert len(result) == 3
1105
+ assert result.column_names == ["col1", "col2", "col3"]
1106
+ assert result.equals(self.table)
1107
+
1108
+ def test_file_to_table_json(self):
1109
+ # Test reading JSON with file_to_table
1110
+ json_path = f"{self.base_path}/test.json"
1111
+
1112
+ result = file_to_table(
1113
+ json_path,
1114
+ ContentType.JSON.value,
1115
+ ContentEncoding.GZIP.value,
1116
+ filesystem=self.fs,
737
1117
  )
738
1118
 
739
- self.assertEqual(len(result), 3)
740
- self.assertEqual(len(result.column_names), 2)
741
- result_schema = result.schema
742
- for index, field in enumerate(result_schema):
743
- self.assertEqual(field.name, schema.field(index).name)
1119
+ assert len(result) == 3
1120
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1121
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
744
1122
 
745
- self.assertEqual(result.schema.field(0).type, "string")
1123
+ def test_file_to_table_avro(self):
1124
+ # Test reading Avro with file_to_table
1125
+ avro_path = f"{self.base_path}/test.avro"
746
1126
 
747
- def test_s3_file_to_table_when_parquet_sanity(self):
1127
+ result = file_to_table(avro_path, ContentType.AVRO.value, filesystem=self.fs)
748
1128
 
749
- pa_kwargs_provider = lambda content_type, kwargs: {
750
- "reader_type": "pyarrow",
751
- **kwargs,
752
- }
1129
+ assert len(result) == 3
1130
+ assert result.column_names == ["col1", "col2", "col3"]
1131
+ # Avro may have different dtypes, so compare values
1132
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
753
1133
 
754
- result = s3_file_to_table(
755
- PARQUET_FILE_PATH,
756
- ContentType.PARQUET.value,
757
- ContentEncoding.IDENTITY.value,
758
- ["n_legs", "animal"],
759
- ["n_legs"],
760
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1134
+ def test_file_to_table_orc(self):
1135
+ # Test reading ORC with file_to_table
1136
+ orc_path = f"{self.base_path}/test.orc"
1137
+
1138
+ result = file_to_table(orc_path, ContentType.ORC.value, filesystem=self.fs)
1139
+
1140
+ assert len(result) == 3
1141
+ assert result.column_names == ["col1", "col2", "col3"]
1142
+ assert result.equals(self.table)
1143
+
1144
+ def test_file_to_table_with_column_selection(self):
1145
+ # Test reading with column selection
1146
+ csv_path = f"{self.base_path}/test.csv"
1147
+
1148
+ result = file_to_table(
1149
+ csv_path,
1150
+ ContentType.CSV.value,
1151
+ ContentEncoding.GZIP.value,
1152
+ filesystem=self.fs,
1153
+ column_names=["col1", "col2", "col3"],
1154
+ include_columns=["col1", "col2"],
761
1155
  )
762
1156
 
763
- self.assertEqual(len(result), 6)
764
- self.assertEqual(len(result.column_names), 1)
765
- schema = result.schema
766
- schema_index = schema.get_field_index("n_legs")
767
- self.assertEqual(schema.field(schema_index).type, "int64")
1157
+ assert len(result) == 3
1158
+ assert len(result.column_names) == 2 # Should only have 2 columns
1159
+ assert result.column_names == ["col1", "col2"]
768
1160
 
769
- def test_s3_file_to_table_when_parquet_schema_overridden(self):
1161
+ def test_file_to_table_with_kwargs_provider(self):
1162
+ # Test reading with kwargs provider
1163
+ csv_path = f"{self.base_path}/test.csv"
1164
+ provider = ReadKwargsProviderPyArrowCsvPureUtf8(
1165
+ include_columns=["col1", "col2", "col3"]
1166
+ )
770
1167
 
771
- schema = pa.schema(
772
- [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
1168
+ result = file_to_table(
1169
+ csv_path,
1170
+ ContentType.CSV.value,
1171
+ ContentEncoding.GZIP.value,
1172
+ filesystem=self.fs,
1173
+ column_names=["col1", "col2", "col3"],
1174
+ pa_read_func_kwargs_provider=provider,
773
1175
  )
774
1176
 
775
- pa_kwargs_provider = lambda content_type, kwargs: {
776
- "schema": schema,
777
- "reader_type": "pyarrow",
778
- **kwargs,
779
- }
1177
+ assert len(result) == 3
1178
+ assert result.column_names == ["col1", "col2", "col3"]
1179
+ # With string types provider, all columns should be strings
1180
+ for col_name in result.column_names:
1181
+ assert result.schema.field(col_name).type == pa.string()
780
1182
 
781
- result = s3_file_to_table(
782
- PARQUET_FILE_PATH,
783
- ContentType.PARQUET.value,
784
- ContentEncoding.IDENTITY.value,
785
- ["n_legs", "animal"],
786
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1183
+ def test_file_to_table_filesystem_inference(self):
1184
+ # Test filesystem inference when no filesystem is provided
1185
+ # Use JSON file since it should work well with inference
1186
+ json_path = f"{self.base_path}/test.json"
1187
+
1188
+ result = file_to_table(
1189
+ json_path,
1190
+ ContentType.JSON.value,
1191
+ ContentEncoding.GZIP.value
1192
+ # No filesystem provided - should be inferred
787
1193
  )
788
1194
 
789
- self.assertEqual(len(result), 6)
790
- self.assertEqual(len(result.column_names), 2)
1195
+ assert len(result) == 3
1196
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1197
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
791
1198
 
792
- result_schema = result.schema
793
- for index, field in enumerate(result_schema):
794
- self.assertEqual(field.name, schema.field(index).name)
1199
+ def test_file_to_table_unsupported_content_type(self):
1200
+ # Test error handling for unsupported content type
1201
+ parquet_path = f"{self.base_path}/test.parquet"
795
1202
 
796
- self.assertEqual(result.schema.field(1).type, "string")
1203
+ with self.assertRaises(NotImplementedError) as context:
1204
+ file_to_table(parquet_path, "unsupported/content-type", filesystem=self.fs)
797
1205
 
798
- def test_s3_file_to_table_when_parquet_gzip(self):
1206
+ assert "not implemented" in str(context.exception)
799
1207
 
800
- pa_kwargs_provider = lambda content_type, kwargs: {
801
- "reader_type": "pyarrow",
802
- **kwargs,
803
- }
1208
+ def test_file_to_table_bzip2_compression(self):
1209
+ # Test BZIP2 compression handling
1210
+ import bz2
804
1211
 
805
- result = s3_file_to_table(
806
- PARQUET_GZIP_COMPRESSED_FILE_PATH,
807
- ContentType.PARQUET.value,
808
- ContentEncoding.GZIP.value,
809
- ["n_legs", "animal"],
810
- ["n_legs"],
811
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1212
+ # Create a BZIP2 compressed CSV file
1213
+ csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
1214
+ compressed_content = bz2.compress(csv_content.encode("utf-8"))
1215
+
1216
+ bz2_path = f"{self.base_path}/test.csv.bz2"
1217
+ with self.fs.open(bz2_path, "wb") as f:
1218
+ f.write(compressed_content)
1219
+
1220
+ result = file_to_table(
1221
+ bz2_path,
1222
+ ContentType.CSV.value,
1223
+ ContentEncoding.BZIP2.value,
1224
+ filesystem=self.fs,
1225
+ column_names=["col1", "col2", "col3"],
812
1226
  )
813
1227
 
814
- self.assertEqual(len(result), 6)
815
- self.assertEqual(len(result.column_names), 1)
816
- schema = result.schema
817
- schema_index = schema.get_field_index("n_legs")
818
- self.assertEqual(schema.field(schema_index).type, "int64")
1228
+ assert len(result) == 3
1229
+ assert result.column_names == ["col1", "col2", "col3"]
1230
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1231
+
1232
+
1233
+ class TestFileToParquet(TestCase):
1234
+ def setUp(self):
1235
+ # Create test data files for reading
1236
+ self.fs = fsspec.filesystem("file")
1237
+ self.base_path = tempfile.mkdtemp()
1238
+ self.fs.makedirs(self.base_path, exist_ok=True)
1239
+
1240
+ # Create test Table
1241
+ self.table = pa.Table.from_pylist(
1242
+ [
1243
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
1244
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
1245
+ {"col1": "test", "col2": 3, "col3": 3.3},
1246
+ ]
1247
+ )
1248
+
1249
+ # Write test parquet files
1250
+ self._create_test_files()
1251
+
1252
+ def tearDown(self):
1253
+ self.fs.rm(self.base_path, recursive=True)
1254
+
1255
+ def _create_test_files(self):
1256
+ # Create basic Parquet file
1257
+ parquet_path = f"{self.base_path}/test.parquet"
1258
+ with self.fs.open(parquet_path, "wb") as f:
1259
+ papq.write_table(self.table, f)
1260
+
1261
+ # Create larger Parquet file with multiple row groups
1262
+ large_table = pa.Table.from_pylist(
1263
+ [{"col1": f"row_{i}", "col2": i, "col3": float(i)} for i in range(1000)]
1264
+ )
1265
+ large_parquet_path = f"{self.base_path}/test_large.parquet"
1266
+ with self.fs.open(large_parquet_path, "wb") as f:
1267
+ papq.write_table(
1268
+ large_table, f, row_group_size=100
1269
+ ) # Create multiple row groups
1270
+
1271
+ def test_file_to_parquet_basic(self):
1272
+ # Test basic parquet file reading
1273
+ parquet_path = f"{self.base_path}/test.parquet"
1274
+
1275
+ result = file_to_parquet(parquet_path, filesystem=self.fs)
1276
+
1277
+ assert isinstance(result, papq.ParquetFile)
1278
+ assert result.num_row_groups > 0
1279
+ assert result.metadata.num_rows == 3
1280
+ assert result.metadata.num_columns == 3
1281
+
1282
+ # Verify we can read the data
1283
+ table = result.read()
1284
+ assert len(table) == 3
1285
+ assert table.column_names == ["col1", "col2", "col3"]
1286
+
1287
+ def test_file_to_parquet_with_schema_provider(self):
1288
+ # Test with schema override provider
1289
+ parquet_path = f"{self.base_path}/test.parquet"
819
1290
 
820
-    def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
         schema = pa.schema(
-            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
+            [
+                pa.field("col1", pa.string()),
+                pa.field("col2", pa.string()),  # Override to string
+                pa.field("col3", pa.string()),  # Override to string
+            ]
         )
-        # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            GZIP_COMPRESSED_FILE_UTSV_PATH,
-            ContentType.UNESCAPED_TSV.value,
-            ContentEncoding.GZIP.value,
-            ["is_active", "ship_datetime_utc"],
-            None,
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+
+        provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        result = file_to_parquet(
+            parquet_path, filesystem=self.fs, pa_read_func_kwargs_provider=provider
         )
 
-        self.assertEqual(len(result), 3)
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
+        assert isinstance(result, papq.ParquetFile)
+        # Note: schema override might not affect ParquetFile metadata,
+        # but should work when reading the table
+        table = result.read()
+        assert len(table) == 3
 
-        self.assertEqual(result.schema.field(0).type, "string")
+    def test_file_to_parquet_with_custom_kwargs(self):
+        # Test with custom ParquetFile kwargs
+        parquet_path = f"{self.base_path}/test.parquet"
 
-    def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
+        result = file_to_parquet(
+            parquet_path,
+            filesystem=self.fs,
+            validate_schema=True,  # Custom kwarg for ParquetFile
+            memory_map=True,  # Another custom kwarg
+        )
 
-        result = s3_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+    def test_file_to_parquet_filesystem_inference(self):
+        # Test filesystem inference when no filesystem is provided
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(
+            parquet_path
+            # No filesystem provided - should be inferred
         )
 
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+        assert result.metadata.num_columns == 3
+
+    def test_file_to_parquet_large_file(self):
+        # Test with larger parquet file (multiple row groups)
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+        assert result.num_row_groups > 1  # Should have multiple row groups
+
+        # Test reading specific row groups
+        first_row_group = result.read_row_group(0)
+        assert len(first_row_group) <= 100  # Based on row_group_size=100
+
+    def test_file_to_parquet_metadata_access(self):
+        # Test accessing various metadata properties
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Test metadata access
+        metadata = result.metadata
+        assert metadata.num_rows == 3
+        assert metadata.num_columns == 3
+        assert metadata.num_row_groups >= 1
+
+        # Test schema access
         schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-
-class TestS3FileToParquet(TestCase):
-    def test_s3_file_to_parquet_sanity(self):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.IDENTITY.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_presanitize_kwargs = cm.records[1].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
+        assert len(schema) == 3
+        assert "col1" in schema.names
+        assert "col2" in schema.names
+        assert "col3" in schema.names
+
+        # Test schema_arrow property
+        schema_arrow = result.schema_arrow
+        assert isinstance(schema_arrow, pa.Schema)
+        assert len(schema_arrow) == 3
+
+    def test_file_to_parquet_column_selection(self):
+        # Test reading specific columns
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Read only specific columns
+        table = result.read(columns=["col1", "col2"])
+        assert len(table.column_names) == 2
+        assert table.column_names == ["col1", "col2"]
+        assert len(table) == 3
+
+    def test_file_to_parquet_invalid_content_type(self):
+        # Test error handling for invalid content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_type=ContentType.CSV.value,  # Invalid content type
+                filesystem=self.fs,
             )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
 
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_log_new_content_encoding = cm.records[1].getMessage()
-        log_message_presanitize_kwargs = cm.records[2].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn(
-            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
-            log_message_log_new_content_encoding,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_invalid_content_encoding(self):
+        # Test error handling for invalid content encoding
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_encoding=ContentEncoding.GZIP.value,  # Invalid encoding
+                filesystem=self.fs,
            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
 
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertRaises(ContentTypeValidationError):
-            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-                s3_file_to_parquet(
-                    test_s3_url,
-                    test_content_type,
-                    test_content_encoding,
-                    ["n_legs", "animal"],
-                    ["n_legs"],
-                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_different_filesystems(self):
+        # Test with different filesystem implementations
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # Test with fsspec filesystem
+        result_fsspec = file_to_parquet(parquet_path, filesystem=self.fs)
+        assert isinstance(result_fsspec, papq.ParquetFile)
+        assert result_fsspec.metadata.num_rows == 3
+
+        # Test with None filesystem (inferred)
+        result_inferred = file_to_parquet(parquet_path, filesystem=None)
+        assert isinstance(result_inferred, papq.ParquetFile)
+        assert result_inferred.metadata.num_rows == 3
+
+    def test_file_to_parquet_lazy_loading(self):
+        # Test that ParquetFile provides lazy loading capabilities
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        # ParquetFile should be created without loading all data
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+
+        # Test reading only specific columns (lazy loading)
+        partial_table = result.read(columns=["col1", "col2"])
+        assert len(partial_table) == 1000  # All rows but only 2 columns
+        assert partial_table.column_names == ["col1", "col2"]
+
+        # Test reading specific row group (lazy loading)
+        row_group_table = result.read_row_group(0)
+        assert len(row_group_table) <= 100  # Based on row_group_size
+
+    def test_file_to_parquet_performance_timing(self):
+        # Test that performance timing is logged (basic functionality test)
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # This should complete without error and log timing
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+
+class TestFileToTableFilesystems(TestCase):
+    """Test file_to_table with different filesystem implementations across all content types."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.tmpdir)
+
+    def _create_test_files(self):
+        """Create test files for all supported content types."""
+        # Test data
+        test_data = pa.table(
+            {
+                "id": [1, 2, 3, 4, 5],
+                "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
+                "age": [25, 30, 35, 28, 32],
+                "score": [85.5, 92.0, 78.5, 88.0, 95.5],
+            }
+        )
+
+        # File paths
+        self.csv_file = f"{self.tmpdir}/test.csv"
+        self.tsv_file = f"{self.tmpdir}/test.tsv"
+        self.psv_file = f"{self.tmpdir}/test.psv"
+        self.unescaped_tsv_file = f"{self.tmpdir}/test_unescaped.tsv"
+        self.parquet_file = f"{self.tmpdir}/test.parquet"
+        self.feather_file = f"{self.tmpdir}/test.feather"
+        self.json_file = f"{self.tmpdir}/test.json"
+        self.orc_file = f"{self.tmpdir}/test.orc"
+        self.avro_file = f"{self.tmpdir}/test.avro"
+
+        # Create CSV file
+        pacsv.write_csv(
+            test_data,
+            self.csv_file,
+            write_options=pacsv.WriteOptions(delimiter=",", include_header=False),
+        )
+
+        # Create TSV file
+        pacsv.write_csv(
+            test_data,
+            self.tsv_file,
+            write_options=pacsv.WriteOptions(delimiter="\t", include_header=False),
+        )
+
+        # Create PSV file
+        pacsv.write_csv(
+            test_data,
+            self.psv_file,
+            write_options=pacsv.WriteOptions(delimiter="|", include_header=False),
+        )
+
+        # Create unescaped TSV file
+        pacsv.write_csv(
+            test_data,
+            self.unescaped_tsv_file,
+            write_options=pacsv.WriteOptions(
+                delimiter="\t", include_header=False, quoting_style="none"
+            ),
+        )
+
+        # Create Parquet file
+        papq.write_table(test_data, self.parquet_file)
+
+        # Create Feather file
+        paf.write_feather(test_data, self.feather_file)
+
+        # Create JSON file (write as JSONL format)
+        df = test_data.to_pandas()
+        with open(self.json_file, "w") as f:
+            for _, row in df.iterrows():
+                json.dump(row.to_dict(), f)
+                f.write("\n")
+
+        # Create ORC file
+        paorc.write_table(test_data, self.orc_file)
+
+        # Create Avro file
+        try:
+            import polars as pl
+
+            pl_df = pl.from_arrow(test_data)
+            pl_df.write_avro(self.avro_file)
+        except ImportError:
+            # Skip Avro file creation if polars is not available
+            self.avro_file = None
+
+    def _get_filesystems(self, file_path):
+        """Get different filesystem implementations for testing."""
+        # fsspec AbstractFileSystem
+        fsspec_fs = fsspec.filesystem("file")
+
+        # PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+
+        # None for automatic inference
+        auto_infer_fs = None
+
+        return [
+            ("fsspec", fsspec_fs),
+            ("pyarrow", pyarrow_fs),
+            ("auto_infer", auto_infer_fs),
+        ]
+
+    def _assert_table_content(self, table, content_type):
+        """Assert that the loaded table has expected content."""
+        self.assertEqual(len(table), 5, f"Expected 5 rows for {content_type}")
+        self.assertEqual(
+            len(table.columns), 4, f"Expected 4 columns for {content_type}"
+        )
+
+        # Check column names exist (order might vary for some formats)
+        column_names = set(table.column_names)
+        expected_columns = {"id", "name", "age", "score"}
+        self.assertEqual(
+            column_names, expected_columns, f"Column names mismatch for {content_type}"
+        )
+
+    def test_csv_all_filesystems(self):
+        """Test CSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"CSV with {fs_name}")
+
+    def test_tsv_all_filesystems(self):
+        """Test TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.tsv_file,
+                    ContentType.TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"TSV with {fs_name}")
+
+    def test_psv_all_filesystems(self):
+        """Test PSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.psv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.psv_file,
+                    ContentType.PSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"PSV with {fs_name}")
+
+    def test_unescaped_tsv_all_filesystems(self):
+        """Test unescaped TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.unescaped_tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.unescaped_tsv_file,
+                    ContentType.UNESCAPED_TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"UNESCAPED_TSV with {fs_name}")
+
+    def test_parquet_all_filesystems(self):
+        """Test Parquet reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"PARQUET with {fs_name}")
+
+    def test_feather_all_filesystems(self):
+        """Test Feather reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.feather_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.feather_file,
+                    ContentType.FEATHER.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"FEATHER with {fs_name}")
+
+    def test_json_all_filesystems(self):
+        """Test JSON reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.json_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.json_file,
+                    ContentType.JSON.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"JSON with {fs_name}")
+
+    def test_orc_all_filesystems(self):
+        """Test ORC reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.orc_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.orc_file,
+                    ContentType.ORC.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"ORC with {fs_name}")
+
+    def test_avro_all_filesystems(self):
+        """Test Avro reading with all filesystem types."""
+        if self.avro_file is None:
+            self.skipTest("Avro file creation skipped (polars not available)")
+
+        for fs_name, filesystem in self._get_filesystems(self.avro_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.avro_file,
+                    ContentType.AVRO.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"AVRO with {fs_name}")
+
+    def test_column_selection_all_filesystems(self):
+        """Test column selection works with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    include_columns=["name", "age"],
+                )
+                self.assertEqual(
+                    len(table.columns), 2, f"Expected 2 columns with {fs_name}"
                 )
-        log_message_log_args = cm.records[0].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
+                self.assertEqual(
+                    set(table.column_names),
+                    {"name", "age"},
+                    f"Column selection failed with {fs_name}",
+                )
+
+    def test_kwargs_provider_all_filesystems(self):
+        """Test that kwargs providers work with all filesystem types."""
+
+        def schema_provider(content_type, kwargs):
+            if content_type == ContentType.CSV.value:
+                # Force all columns to be strings
+                kwargs["convert_options"] = pacsv.ConvertOptions(
+                    column_types={
+                        "id": pa.string(),
+                        "name": pa.string(),
+                        "age": pa.string(),
+                        "score": pa.string(),
+                    }
+                )
+            return kwargs
+
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                    pa_read_func_kwargs_provider=schema_provider,
+                )
+                # Check that all columns are strings
+                for field in table.schema:
+                    self.assertEqual(
+                        field.type,
+                        pa.string(),
+                        f"Column {field.name} should be string with {fs_name}",
+                    )
+
+    def test_filesystem_auto_inference_consistency(self):
+        """Test that auto-inferred filesystem produces same results as explicit filesystems."""
+        # Use Parquet as it's most reliable across filesystem types
+
+        # Read with auto-inference
+        auto_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=None,  # Auto-infer
+        )
+
+        # Read with explicit fsspec filesystem
+        fsspec_fs = fsspec.filesystem("file")
+        fsspec_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=fsspec_fs,
         )
+
+        # Read with explicit PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+        pyarrow_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=pyarrow_fs,
+        )
+
+        # All should produce equivalent results
+        self.assertTrue(
+            auto_table.equals(fsspec_table),
+            "Auto-inferred result should match fsspec result",
+        )
+        self.assertTrue(
+            auto_table.equals(pyarrow_table),
+            "Auto-inferred result should match PyArrow result",
+        )
+
+    def test_error_handling_all_filesystems(self):
+        """Test error handling works consistently across filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                # Test unsupported content type
+                with self.assertRaises(NotImplementedError):
+                    file_to_table(
+                        self.parquet_file,
+                        "UNSUPPORTED_TYPE",
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
+
+                # Test non-existent file
+                with self.assertRaises((FileNotFoundError, OSError)):
+                    file_to_table(
+                        f"{self.tmpdir}/non_existent.parquet",
+                        ContentType.PARQUET.value,
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
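
The added tests above exercise the new path-based readers (file_to_table and file_to_parquet) against an explicit fsspec filesystem, an explicit PyArrow filesystem, and automatic filesystem inference. A minimal usage sketch of that calling pattern outside the test harness follows; the import paths are assumptions (this hunk does not show the module's import block), and only the call signatures visible in the tests above are used.

# Hedged sketch: the import locations below are assumed, not confirmed by this hunk.
import fsspec
import pyarrow.parquet as papq
from deltacat.types.media import ContentEncoding, ContentType  # assumed module path
from deltacat.utils.pyarrow import file_to_parquet, file_to_table  # assumed module path

# Read a local Parquet file into a pyarrow.Table, letting the reader infer the filesystem.
table = file_to_table(
    "/tmp/example.parquet",
    ContentType.PARQUET.value,
    ContentEncoding.IDENTITY.value,
    filesystem=None,  # None triggers filesystem inference, as in the tests above
)

# Open the same file lazily as a pyarrow.parquet.ParquetFile via an explicit fsspec filesystem.
fs = fsspec.filesystem("file")
pq_file = file_to_parquet("/tmp/example.parquet", filesystem=fs)
assert isinstance(pq_file, papq.ParquetFile)
assert pq_file.metadata.num_rows == len(table)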