deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py CHANGED
@@ -1,19 +1,21 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations

+ import copy
  import bz2
  import gzip
  import io
  import logging
  from functools import partial
- from typing import Any, Callable, Dict, Iterable, List, Optional, Union
- from pyarrow.parquet import ParquetFile
- from deltacat.exceptions import ContentTypeValidationError
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Tuple
+ from datetime import datetime, date
+ from decimal import Decimal

  import pyarrow as pa
  import numpy as np
  import pyarrow.compute as pc
  import pyarrow.fs as pafs
+ from pyarrow.parquet import ParquetFile

  from fsspec import AbstractFileSystem
  from pyarrow import csv as pacsv
@@ -22,9 +24,9 @@ from pyarrow import json as pajson
  from pyarrow import parquet as papq
  from pyarrow import orc as paorc
  from ray.data.datasource import FilenameProvider
- from deltacat.utils.s3fs import create_s3_file_system

  from deltacat import logs
+ from deltacat.exceptions import ContentTypeValidationError
  from deltacat.types.media import (
      DELIMITED_TEXT_CONTENT_TYPES,
      TABULAR_CONTENT_TYPES,
@@ -37,7 +39,6 @@ from deltacat.types.partial_download import (
  )
  from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
  from deltacat.utils.performance import timed_invocation
- from deltacat.utils.daft import daft_s3_file_to_table
  from deltacat.utils.schema import coerce_pyarrow_table_to_schema
  from deltacat.utils.arguments import (
      sanitize_kwargs_to_callable,
@@ -45,12 +46,30 @@ from deltacat.utils.arguments import (
  )
  from deltacat.utils.filesystem import resolve_path_and_filesystem
  from functools import lru_cache
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from deltacat.storage.model.manifest import Manifest
+     from deltacat.storage.model.delta import Delta


  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
  READER_TYPE_KWARG = "reader_type"
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
+
+ """
+ By default, round decimal values using half_to_even round mode when
+ rescaling a decimal to the given scale and precision in the schema would cause
+ data loss. Setting any non null value of this argument will result
+ in an error instead.
+ """
+ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+ # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+ DECIMAL256_DEFAULT_SCALE = 38
+ DECIMAL256_MAX_PRECISION = 76
+ MAX_INT_BYTES = 2147483646


  def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -69,42 +88,142 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
      return target_schema


- def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-     try:
-         new_kwargs = sanitize_kwargs_by_supported_kwargs(
-             ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+ def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+     schema = None
+     if (
+         "convert_options" in kwargs
+         and kwargs["convert_options"].column_types is not None
+     ):
+         schema = kwargs["convert_options"].column_types
+         if not isinstance(schema, pa.Schema):
+             schema = pa.schema(schema)
+         if kwargs["convert_options"].include_columns:
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["convert_options"].include_columns
+             )
+         elif (
+             kwargs.get("read_options") is not None
+             and kwargs["read_options"].column_names
+         ):
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["read_options"].column_names
+             )
+     else:
+         logger.debug(
+             "Schema not specified in the kwargs."
+             " Hence, schema could not be inferred from the empty CSV."
          )
+
+     return schema
+
+
+ def _new_schema_with_replaced_fields(
+     schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+ ) -> pa.Schema:
+     if schema is None:
+         return None
+
+     new_schema_fields = []
+     for field in schema:
+         new_field = field_to_replace(field)
+         if new_field is not None:
+             new_schema_fields.append(new_field)
+         else:
+             new_schema_fields.append(field)
+
+     return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+ def _read_csv_rounding_decimal_columns_to_fit_scale(
+     schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+ ) -> pa.Table:
+     # Note: We read decimals as strings first because CSV
+     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+     new_schema = _new_schema_with_replaced_fields(
+         schema,
+         lambda fld: (
+             pa.field(fld.name, pa.string(), metadata=fld.metadata)
+             if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+             else None
+         ),
+     )
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"],
+         reader_kwargs,
+     )
+     # Creating a shallow copy for efficiency
+     new_convert_options = copy.copy(new_kwargs["convert_options"])
+     new_convert_options.column_types = new_schema
+     new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+     arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+     for column_index, field in enumerate(schema):
+         if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+             column_array = arrow_table[field.name]
+             # We always cast to decimal256 to accomodate fixed scale of 38
+             cast_to_type = pa.decimal256(
+                 DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+             )
+             casted_decimal_array = pc.cast(column_array, cast_to_type)
+             # Note that scale can be negative
+             rounded_column_array = pc.round(
+                 casted_decimal_array, ndigits=field.type.scale
+             )
+             final_decimal_array = pc.cast(rounded_column_array, field.type)
+             arrow_table = arrow_table.set_column(
+                 column_index,
+                 field,
+                 final_decimal_array,
+             )
+             logger.debug(
+                 f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                 f" {field.type.precision} precision"
+             )
+
+     return arrow_table
+
+
+ def pyarrow_read_csv_default(*args, **kwargs):
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+     )
+
+     try:
          return pacsv.read_csv(*args, **new_kwargs)
      except pa.lib.ArrowInvalid as e:
-         if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-             schema = None
-             if (
-                 "convert_options" in kwargs
-                 and kwargs["convert_options"].column_types is not None
-             ):
-                 schema = kwargs["convert_options"].column_types
-                 if not isinstance(schema, pa.Schema):
-                     schema = pa.schema(schema)
-                 if kwargs["convert_options"].include_columns:
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["convert_options"].include_columns
-                     )
-                 elif (
-                     kwargs.get("read_options") is not None
-                     and kwargs["read_options"].column_names
+         error_str = e.__str__()
+         schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+         if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+             logger.debug(f"Read CSV empty schema being used: {schema}")
+             return pa.Table.from_pylist([], schema=schema)
+         if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+             # Note, this logic requires expensive casting. To prevent downgrading performance
+             # for happy path reads, we are handling this case in response to an error.
+             logger.warning(
+                 "Rescaling Decimal to the given scale in the schema. "
+                 f"Original error: {error_str}"
+             )
+
+             if schema is not None and "convert_options" in kwargs:
+                 if (
+                     "Rescaling Decimal" in error_str
+                     and "value would cause data loss" in error_str
                  ):
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["read_options"].column_names
+                     logger.debug(f"Checking if the file: {args[0]}...")
+                     # Since we are re-reading the file, we have to seek to beginning
+                     if isinstance(args[0], io.IOBase) and args[0].seekable():
+                         logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                         args[0].seek(0)
+                     return _read_csv_rounding_decimal_columns_to_fit_scale(
+                         schema=schema, reader_args=args, reader_kwargs=kwargs
                      )
-
              else:
                  logger.debug(
-                     "Schema not specified in the kwargs."
-                     " Hence, schema could not be inferred from the empty CSV."
+                     "Schema is None when trying to adjust decimal values. "
+                     "Hence, bubbling up exception..."
                  )

-             logger.debug(f"Read CSV empty schema being used: {schema}")
-             return pa.Table.from_pylist([], schema=schema)
          raise e


@@ -114,14 +233,39 @@ def read_csv(
      *,
      filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
      **read_kwargs,
  ) -> pa.Table:
+     # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+     # TODO(pdames): Merge in decimal256 support from pure S3 path reader.
+
+     # Check if compression is already indicated by file path
+     should_decompress = path.endswith(".gz")
+
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
-             return pacsv.read_csv(f, **read_kwargs)
-     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
-         return pacsv.read_csv(f, **read_kwargs)
+             # Handle decompression - avoid double decompression for PyArrow filesystem
+             if should_decompress:
+                 # PyArrow filesystem already handles .gz decompression automatically
+                 return pacsv.read_csv(f, **pyarrow_kwargs)
+             else:
+                 # Apply explicit decompression if needed
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     return pacsv.read_csv(input_file, **pyarrow_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle decompression - apply explicit decompression for fsspec
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return pacsv.read_csv(input_file, **pyarrow_kwargs)


  def read_feather(
@@ -129,14 +273,57 @@ def read_feather(
      *,
      filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
      **read_kwargs,
  ) -> pa.Table:
+     # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
-         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
-             return paf.read_feather(f, **read_kwargs)
-     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
-         return paf.read_feather(f, **read_kwargs)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return paf.read_table(input_file, **pyarrow_kwargs)
+     else:
+         # fsspec AbstractFileSystem - Feather requires seekable files
+         # For local files, we can use the file path directly
+         if hasattr(filesystem, "protocol") and filesystem.protocol == "file":
+             if content_encoding != ContentEncoding.IDENTITY.value:
+                 # For compressed files, decompress to a temporary file
+                 import tempfile
+                 import shutil
+
+                 with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                     input_file_init = ENCODING_TO_FILE_INIT.get(
+                         content_encoding, lambda x: x
+                     )
+                     with input_file_init(f) as input_file:
+                         # Create temporary file to hold decompressed data
+                         with tempfile.NamedTemporaryFile() as temp_file:
+                             shutil.copyfileobj(input_file, temp_file)
+                             temp_file.flush()
+                             return paf.read_table(temp_file.name, **read_kwargs)
+             else:
+                 # No compression, can read directly from file path
+                 return paf.read_table(path, **pyarrow_kwargs)
+         else:
+             # For non-local filesystems, always read from temporary file
+             import tempfile
+             import shutil
+
+             with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Create temporary file to hold data
+                     with tempfile.NamedTemporaryFile() as temp_file:
+                         shutil.copyfileobj(input_file, temp_file)
+                         temp_file.flush()
+                         return paf.read_table(temp_file.name, **read_kwargs)


  def read_json(
@@ -144,14 +331,37 @@ def read_json(
      *,
      filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
      **read_kwargs,
  ) -> pa.Table:
+     # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+     # Check if decompression is already indicated by file path
+     should_decompress = path.endswith(".gz")
+
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
-             return pajson.read_json(f, **read_kwargs)
-     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
-         return pajson.read_json(f, **read_kwargs)
+             # Handle decompression - avoid double decompression for PyArrow filesystem
+             if should_decompress:
+                 # PyArrow filesystem already handles .gz decompression automatically
+                 return pajson.read_json(f, **pyarrow_kwargs)
+             else:
+                 # Apply explicit decompression if needed
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     return pajson.read_json(input_file, **pyarrow_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle decompression - apply explicit decompression for fsspec
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return pajson.read_json(input_file, **pyarrow_kwargs)


  def read_orc(
@@ -159,14 +369,42 @@ def read_orc(
      *,
      filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
      **read_kwargs,
  ) -> pa.Table:
+     # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
-         with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
-             return paorc.read_table(f, **read_kwargs)
-     with filesystem.open(path, "rb", **fs_open_kwargs) as f:
-         return paorc.read_table(f, **read_kwargs)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return paorc.read_table(input_file, **pyarrow_kwargs)
+     else:
+         # fsspec AbstractFileSystem - ORC requires seekable files, so handle compression differently
+         if content_encoding != ContentEncoding.IDENTITY.value:
+             # For compressed files with fsspec, we need to decompress to a temporary file
+             # since ORC requires seekable streams
+             import tempfile
+             import shutil
+
+             with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                 input_file_init = ENCODING_TO_FILE_INIT.get(
+                     content_encoding, lambda x: x
+                 )
+                 with input_file_init(f) as input_file:
+                     # Create temporary file to hold decompressed data
+                     with tempfile.NamedTemporaryFile() as temp_file:
+                         shutil.copyfileobj(input_file, temp_file)
+                         temp_file.flush()
+                         return paorc.read_table(temp_file.name, **pyarrow_kwargs)
+         else:
+             # No compression, can read directly
+             with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+                 return paorc.read_table(f, **pyarrow_kwargs)


  def read_parquet(
@@ -174,27 +412,117 @@ def read_parquet(
      *,
      filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
      **read_kwargs,
  ) -> pa.Table:
+     # Convert DeltaCAT Schema to PyArrow Schema if present
+     if "schema" in read_kwargs:
+         from deltacat.storage.model.schema import Schema as DeltaCATSchema
+
+         schema = read_kwargs["schema"]
+         if isinstance(schema, DeltaCATSchema):
+             read_kwargs["schema"] = schema.arrow
+
+     # Filter out DeltaCAT-specific parameters that PyArrow doesn't understand
+     # Use local import to avoid circular dependency
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     pyarrow_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+         with filesystem.open_input_file(path, **fs_open_kwargs) as f:
+             # Handle compression
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return papq.read_table(input_file, **pyarrow_kwargs)
+     else:
+         # fsspec AbstractFileSystem
+         with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+             # Handle compression
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 return papq.read_table(input_file, **pyarrow_kwargs)
+
+
+ def read_avro(
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     content_encoding: str = ContentEncoding.IDENTITY.value,
+     **read_kwargs,
+ ) -> pa.Table:
+     # Filter out DeltaCAT-specific parameters that Polars doesn't understand
+     from deltacat.types.tables import _filter_kwargs_for_external_readers
+
+     polars_kwargs = _filter_kwargs_for_external_readers(read_kwargs)
+     """
+     Read an Avro file using polars and convert to PyArrow.
+     """
+     import polars as pl
+
+     # If path is a file-like object, read directly
+     if hasattr(path, "read"):
+         pl_df = pl.read_avro(path, **polars_kwargs)
+         return pl_df.to_arrow()
+
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
-             return papq.read_table(f, **read_kwargs)
+             # Handle compression
+             input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+             with input_file_init(f) as input_file:
+                 pl_df = pl.read_avro(input_file, **polars_kwargs)
+                 return pl_df.to_arrow()
      with filesystem.open(path, "rb", **fs_open_kwargs) as f:
-         return papq.read_table(f, **read_kwargs)
+         input_file_init = ENCODING_TO_FILE_INIT.get(content_encoding, lambda x: x)
+         with input_file_init(f) as input_file:
+             pl_df = pl.read_avro(input_file, **polars_kwargs)
+             return pl_df.to_arrow()


- CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
+ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+     schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+     # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+     # Below ensures decimal256 is casted properly.
+     schema_includes_decimal256 = (
+         (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+         if schema is not None
+         else None
+     )
+     if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+         # falling back to expensive method of reading CSV
+         return _read_csv_rounding_decimal_columns_to_fit_scale(
+             schema, reader_args=args, reader_kwargs=kwargs
+         )
+     else:
+         return pyarrow_read_csv_default(*args, **kwargs)
+
+
+ CONTENT_TYPE_TO_PA_S3_READ_FUNC: Dict[str, Callable] = {
      ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
      ContentType.TSV.value: pyarrow_read_csv,
      ContentType.CSV.value: pyarrow_read_csv,
      ContentType.PSV.value: pyarrow_read_csv,
      ContentType.PARQUET.value: papq.read_table,
      ContentType.FEATHER.value: paf.read_table,
-     # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
-     # https://issues.apache.org/jira/browse/ARROW-7811
-     # ContentType.ORC.value: paorc.ContentType.ORCFile,
      ContentType.JSON.value: pajson.read_json,
+     ContentType.ORC.value: paorc.read_table,
+     ContentType.AVRO.value: read_avro,
+ }
+
+
+ CONTENT_TYPE_TO_READ_FN: Dict[str, Callable] = {
+     ContentType.UNESCAPED_TSV.value: read_csv,
+     ContentType.TSV.value: read_csv,
+     ContentType.CSV.value: read_csv,
+     ContentType.PSV.value: read_csv,
+     ContentType.PARQUET.value: read_parquet,
+     ContentType.FEATHER.value: read_feather,
+     ContentType.JSON.value: read_json,
+     ContentType.ORC.value: read_orc,
+     ContentType.AVRO.value: read_avro,
  }


@@ -207,7 +535,7 @@ def write_feather(
      **write_kwargs,
  ) -> None:
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
              paf.write_feather(table, f, **write_kwargs)
      else:
@@ -223,20 +551,31 @@ def write_csv(
      fs_open_kwargs: Dict[str, any] = {},
      **write_kwargs,
  ) -> None:
+     if write_kwargs.get("write_options") is None:
+         # column names are kept in table metadata, so omit header
+         write_kwargs["write_options"] = pacsv.WriteOptions(include_header=False)
+
+     # Check if the path already indicates compression to avoid double compression
+     should_compress = path.endswith(".gz")
+
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
-             pacsv.write_csv(table, f, **write_kwargs)
+             if should_compress:
+                 # Path ends with .gz, PyArrow filesystem automatically compresses, no need for additional compression
+                 pacsv.write_csv(table, f, **write_kwargs)
+             else:
+                 # No compression indicated, write uncompressed
+                 pacsv.write_csv(table, f, **write_kwargs)
      else:
          with filesystem.open(path, "wb", **fs_open_kwargs) as f:
-             # TODO (pdames): Add support for client-specified compression types.
-             with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
-                 if write_kwargs.get("write_options") is None:
-                     # column names are kept in table metadata, so omit header
-                     write_kwargs["write_options"] = pacsv.WriteOptions(
-                         include_header=False
-                     )
-                 pacsv.write_csv(table, out, **write_kwargs)
+             if should_compress:
+                 # For fsspec filesystems, we need to apply compression explicitly
+                 with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                     pacsv.write_csv(table, out, **write_kwargs)
+             else:
+                 # No compression indicated, write uncompressed
+                 pacsv.write_csv(table, f, **write_kwargs)


  def write_orc(
@@ -248,7 +587,7 @@ def write_orc(
      **write_kwargs,
  ) -> None:
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
              paorc.write_table(table, f, **write_kwargs)
      else:
@@ -265,7 +604,7 @@ def write_parquet(
      **write_kwargs,
  ) -> None:
      if not filesystem or isinstance(filesystem, pafs.FileSystem):
-         path, filesystem = resolve_path_and_filesystem(path)
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
          with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
              papq.write_table(table, f, **write_kwargs)
      else:
@@ -273,17 +612,120 @@ def write_parquet(
          papq.write_table(table, f, **write_kwargs)


+ def write_json(
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     """
+     Write a PyArrow Table to a JSON file by delegating to polars implementation.
+     """
+     import polars as pl
+     from deltacat.utils.polars import write_json as polars_write_json
+
+     # Convert PyArrow Table to polars DataFrame
+     pl_df = pl.from_arrow(table)
+
+     # Delegate to polars write_json implementation with GZIP compression
+     polars_write_json(
+         pl_df,
+         path,
+         filesystem=filesystem,
+         fs_open_kwargs=fs_open_kwargs,
+         **write_kwargs,
+     )
+
+
+ def write_avro(
+     table: pa.Table,
+     path: str,
+     *,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+     fs_open_kwargs: Dict[str, any] = {},
+     **write_kwargs,
+ ) -> None:
+     """
+     Write a PyArrow Table to an AVRO file by delegating to polars implementation.
+     """
+     import polars as pl
+     from deltacat.utils.polars import write_avro as polars_write_avro
+
+     # Convert PyArrow Table to polars DataFrame
+     pl_df = pl.from_arrow(table)
+
+     # Delegate to polars write_avro implementation
+     polars_write_avro(
+         pl_df,
+         path,
+         filesystem=filesystem,
+         fs_open_kwargs=fs_open_kwargs,
+         **write_kwargs,
+     )
+
+
  CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
-     # TODO (pdames): add support for other delimited text content types as
-     # pyarrow adds support for custom delimiters, escaping, and None value
-     # representations to pyarrow.csv.WriteOptions.
+     ContentType.UNESCAPED_TSV.value: write_csv,
+     ContentType.TSV.value: write_csv,
      ContentType.CSV.value: write_csv,
-     ContentType.ORC.value: write_orc,
+     ContentType.PSV.value: write_csv,
      ContentType.PARQUET.value: write_parquet,
      ContentType.FEATHER.value: write_feather,
+     ContentType.JSON.value: write_json,
+     ContentType.AVRO.value: write_avro,
+     ContentType.ORC.value: write_orc,
  }


+ def content_type_to_writer_kwargs(content_type: str) -> Dict[str, Any]:
+     """
+     Returns writer kwargs for the given content type when writing with pyarrow.
+     """
+     if content_type == ContentType.UNESCAPED_TSV.value:
+         return {
+             "write_options": pacsv.WriteOptions(
+                 delimiter="\t",
+                 include_header=False,
+                 quoting_style="none",
+             )
+         }
+     if content_type == ContentType.TSV.value:
+         return {
+             "write_options": pacsv.WriteOptions(
+                 include_header=False,
+                 delimiter="\t",
+                 quoting_style="needed",
+             )
+         }
+     if content_type == ContentType.CSV.value:
+         return {
+             "write_options": pacsv.WriteOptions(
+                 include_header=False,
+                 delimiter=",",
+                 quoting_style="needed",
+             )
+         }
+     if content_type == ContentType.PSV.value:
+         return {
+             "write_options": pacsv.WriteOptions(
+                 include_header=False,
+                 delimiter="|",
+                 quoting_style="needed",
+             )
+         }
+     if content_type in {
+         ContentType.PARQUET.value,
+         ContentType.FEATHER.value,
+         ContentType.JSON.value,
+         ContentType.AVRO.value,
+         ContentType.ORC.value,
+     }:
+         return {}
+     raise ValueError(f"Unsupported content type: {content_type}")
+
+
  def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
      if content_type == ContentType.UNESCAPED_TSV.value:
          return {
@@ -303,12 +745,10 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
          ContentType.PARQUET.value,
          ContentType.FEATHER.value,
          ContentType.JSON.value,
+         ContentType.ORC.value,
+         ContentType.AVRO.value,
      }:
          return {}
-     # Pyarrow.orc is disabled in Pyarrow 0.15, 0.16:
-     # https://issues.apache.org/jira/browse/ARROW-7811
-     # if DataTypes.ContentType.ORC:
-     #     return {},
      raise ValueError(f"Unsupported content type: {content_type}")


@@ -320,7 +760,10 @@ ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
  }


- def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
+ def slice_table(
+     table: pa.Table,
+     max_len: Optional[int],
+ ) -> List[pa.Table]:
      """
      Iteratively create 0-copy table slices.
      """
@@ -337,6 +780,23 @@ def slice_table(table: pa.Table, max_len: Optional[int]) -> List[pa.Table]:
      return tables


+ def append_column_to_table(
+     table: pa.Table,
+     column_name: str,
+     column_value: Any,
+ ) -> pa.Table:
+     num_rows = table.num_rows
+     column_array = pa.array([column_value] * num_rows)
+     return table.append_column(column_name, column_array)
+
+
+ def select_columns(
+     table: pa.Table,
+     column_names: List[str],
+ ) -> pa.Table:
+     return table.select(column_names)
+
+
  class ReadKwargsProviderPyArrowCsvPureUtf8(ContentTypeKwargsProvider):
      """ReadKwargsProvider impl that reads columns of delimited text files
      as UTF-8 strings (i.e. disables type inference). Useful for ensuring
@@ -458,15 +918,16 @@ def _add_column_kwargs(
                  )


- def s3_partial_parquet_file_to_table(
-     s3_url: str,
+ def partial_parquet_file_to_table(
+     path: str,
      content_type: str,
      content_encoding: str,
+     filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
      column_names: Optional[List[str]] = None,
      include_columns: Optional[List[str]] = None,
      pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
      partial_file_download_params: Optional[PartialParquetParameters] = None,
-     **s3_client_kwargs,
+     **kwargs,
  ) -> pa.Table:

      assert (
@@ -476,13 +937,18 @@
          partial_file_download_params.row_groups_to_download is not None
      ), "No row groups to download"

-     pq_file = s3_file_to_parquet(
-         s3_url=s3_url,
+     # Resolve filesystem and path
+     if not filesystem or isinstance(filesystem, pafs.FileSystem):
+         path, filesystem = resolve_path_and_filesystem(path, filesystem)
+
+     pq_file = file_to_parquet(
+         path=path,
          content_type=content_type,
          content_encoding=content_encoding,
+         filesystem=filesystem,
          partial_file_download_params=partial_file_download_params,
          pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
-         **s3_client_kwargs,
+         **kwargs,
      )

      table, latency = timed_invocation(
@@ -491,7 +957,7 @@
          columns=include_columns or column_names,
      )

-     logger.debug(f"Successfully read from s3_url={s3_url} in {latency}s")
+     logger.debug(f"Successfully read from path={path} in {latency}s")

      kwargs = {}

@@ -525,128 +991,6 @@ def s3_partial_parquet_file_to_table(
525
991
  return table
526
992
 
527
993
 
528
- def s3_file_to_table(
529
- s3_url: str,
530
- content_type: str,
531
- content_encoding: str,
532
- column_names: Optional[List[str]] = None,
533
- include_columns: Optional[List[str]] = None,
534
- pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
535
- partial_file_download_params: Optional[PartialFileDownloadParams] = None,
536
- **s3_client_kwargs,
537
- ) -> pa.Table:
538
-
539
- logger.debug(
540
- f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
541
- f"Encoding: {content_encoding}"
542
- )
543
-
544
- kwargs = content_type_to_reader_kwargs(content_type)
545
- _add_column_kwargs(content_type, column_names, include_columns, kwargs)
546
-
547
- if pa_read_func_kwargs_provider is not None:
548
- kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
549
-
550
- if (
551
- content_type == ContentType.PARQUET.value
552
- and content_encoding == ContentEncoding.IDENTITY.value
553
- ):
554
- logger.debug(
555
- f"Performing read using parquet reader for encoding={content_encoding} "
556
- f"and content_type={content_type}"
557
- )
558
-
559
- parquet_reader_func = None
560
- if kwargs.get(READER_TYPE_KWARG, "daft") == "daft":
561
- parquet_reader_func = daft_s3_file_to_table
562
- elif partial_file_download_params and isinstance(
563
- partial_file_download_params, PartialParquetParameters
564
- ):
565
- parquet_reader_func = s3_partial_parquet_file_to_table
566
-
567
- if parquet_reader_func is not None:
568
- return parquet_reader_func(
569
- s3_url=s3_url,
570
- content_type=content_type,
571
- content_encoding=content_encoding,
572
- column_names=column_names,
573
- include_columns=include_columns,
574
- pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
575
- partial_file_download_params=partial_file_download_params,
576
- **s3_client_kwargs,
577
- )
578
-
579
- if READER_TYPE_KWARG in kwargs:
580
- kwargs.pop(READER_TYPE_KWARG)
581
-
582
- filesystem = io
583
- if s3_url.startswith("s3://"):
584
- filesystem = create_s3_file_system(s3_client_kwargs)
585
-
586
- logger.debug(f"Read S3 object from {s3_url} using filesystem: {filesystem}")
587
- input_file_init = ENCODING_TO_FILE_INIT[content_encoding]
588
- pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
589
-
590
- with filesystem.open(s3_url, "rb") as s3_file, input_file_init(
591
- s3_file
592
- ) as input_file:
593
- args = [input_file]
594
- logger.debug(f"Reading {s3_url} via {pa_read_func} with kwargs: {kwargs}")
595
- table, latency = timed_invocation(pa_read_func, *args, **kwargs)
596
- logger.debug(f"Time to read {s3_url} into PyArrow table: {latency}s")
597
- return table
598
-
599
-
600
- def s3_file_to_parquet(
601
- s3_url: str,
602
- content_type: str,
603
- content_encoding: str,
604
- column_names: Optional[List[str]] = None,
605
- include_columns: Optional[List[str]] = None,
606
- pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
607
- partial_file_download_params: Optional[PartialFileDownloadParams] = None,
608
- **s3_client_kwargs,
609
- ) -> ParquetFile:
610
- logger.debug(
611
- f"Reading {s3_url} to PyArrow ParquetFile. "
612
- f"Content type: {content_type}. Encoding: {content_encoding}"
613
- )
614
-
615
- if (
616
- content_type != ContentType.PARQUET.value
617
- or content_encoding != ContentEncoding.IDENTITY
618
- ):
619
- raise ContentTypeValidationError(
620
- f"S3 file with content type: {content_type} and content encoding: {content_encoding} "
621
- "cannot be read into pyarrow.parquet.ParquetFile"
622
- )
623
-
624
- if s3_client_kwargs is None:
625
- s3_client_kwargs = {}
626
-
627
- kwargs = {}
628
-
629
- if s3_url.startswith("s3://"):
630
- s3_file_system = create_s3_file_system(s3_client_kwargs)
631
- kwargs["filesystem"] = s3_file_system
632
-
633
- if pa_read_func_kwargs_provider:
634
- kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
635
-
636
- logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
637
-
638
- kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
639
-
640
- logger.debug(
641
- f"Reading the file from {s3_url} into ParquetFile with kwargs: {kwargs}"
642
- )
643
- pqFile, latency = timed_invocation(ParquetFile, s3_url, **kwargs)
644
-
645
- logger.debug(f"Time to get {s3_url} into parquet file: {latency}s")
646
-
647
- return pqFile
648
-
649
-
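The deleted s3_file_to_table and s3_file_to_parquet helpers appear to be superseded by the filesystem-agnostic file_to_table and file_to_parquet functions added later in this file. A minimal migration sketch, assuming the new helpers are exposed from deltacat.utils.pyarrow and that ContentType/ContentEncoding live in deltacat.types.media (both paths assumed, not shown in this hunk); the s3:// URL is a placeholder:

    # Hypothetical migration sketch based on the call shapes in this diff.
    from deltacat.types.media import ContentEncoding, ContentType  # assumed import path
    from deltacat.utils.pyarrow import file_to_parquet  # assumed module path

    # Before (removed above): S3-only helper taking an s3_url plus boto client kwargs.
    # pq_file = s3_file_to_parquet(
    #     s3_url="s3://bucket/key.parquet",
    #     content_type=ContentType.PARQUET.value,
    #     content_encoding=ContentEncoding.IDENTITY.value,
    # )

    # After (added below): generic path plus an optional explicit filesystem.
    pq_file = file_to_parquet(
        path="s3://bucket/key.parquet",  # placeholder path
        content_type=ContentType.PARQUET.value,
        content_encoding=ContentEncoding.IDENTITY.value,
    )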
650
994
  def table_size(table: pa.Table) -> int:
651
995
  return table.nbytes
652
996
 
@@ -658,13 +1002,23 @@ def parquet_file_size(table: papq.ParquetFile) -> int:
658
1002
  def table_to_file(
659
1003
  table: pa.Table,
660
1004
  base_path: str,
661
- file_system: Optional[AbstractFileSystem],
1005
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]],
662
1006
  block_path_provider: Union[Callable, FilenameProvider],
663
1007
  content_type: str = ContentType.PARQUET.value,
1008
+ schema: Optional[pa.Schema] = None,
664
1009
  **kwargs,
665
1010
  ) -> None:
666
1011
  """
667
1012
  Writes the given PyArrow Table to a file.
1013
+
1014
+ Args:
1015
+ table: The PyArrow table to write
1016
+ base_path: Base path to write to
1017
+ filesystem: Optional filesystem to use
1018
+ block_path_provider: Provider for block path generation
1019
+ content_type: Content type for the output file
1020
+ schema: Optional schema (for compatibility with explicit schema parameter pattern)
1021
+ kwargs: Keyword arguments passed to the PyArrow write function
668
1022
  """
669
1023
  writer = CONTENT_TYPE_TO_PA_WRITE_FUNC.get(content_type)
670
1024
  if not writer:
@@ -674,8 +1028,10 @@ def table_to_file(
674
1028
  f"{list(CONTENT_TYPE_TO_PA_WRITE_FUNC.keys())}"
675
1029
  )
676
1030
  path = block_path_provider(base_path)
677
- logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
678
- writer(table, path, filesystem=file_system, **kwargs)
1031
+ writer_kwargs = content_type_to_writer_kwargs(content_type)
1032
+ writer_kwargs.update(kwargs)
1033
+ logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
1034
+ writer(table, path, filesystem=filesystem, **writer_kwargs)
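A minimal usage sketch for the updated table_to_file signature above, assuming the helper and ContentType are importable as commented below and writing to a hypothetical local directory; block_path_provider is shown in its plain-callable form:

    import pyarrow as pa
    # from deltacat.utils.pyarrow import table_to_file  # assumed module path
    # from deltacat.types.media import ContentType      # assumed import path

    table = pa.table({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    def provider(base_path: str) -> str:
        # Callable form of block_path_provider: maps the base path to a full file path.
        return f"{base_path}/part-00000.parquet"

    table_to_file(
        table=table,
        base_path="/tmp/deltacat-example",  # hypothetical local directory
        filesystem=None,                    # let the writer fall back to the local filesystem
        block_path_provider=provider,
        content_type=ContentType.PARQUET.value,
    )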
679
1035
 
680
1036
 
681
1037
  class RecordBatchTables:
@@ -919,7 +1275,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
919
1275
  TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
920
1276
  """
921
1277
  dtype = array.type
922
- MAX_BYTES = 2147483646
923
1278
  max_str_len = None
924
1279
  if pa.types.is_integer(dtype):
925
1280
  max_str_len = _int_max_string_len()
@@ -931,7 +1286,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
931
1286
  max_str_len = _max_decimal256_string_len()
932
1287
 
933
1288
  if max_str_len is not None:
934
- max_elems_per_chunk = MAX_BYTES // (2 * max_str_len) # safety factor of 2
1289
+ max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len) # safety factor of 2
935
1290
  all_chunks = []
936
1291
  for chunk in array.chunks:
937
1292
  if len(chunk) < max_elems_per_chunk:
@@ -946,3 +1301,693 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
946
1301
  array = pa.chunked_array(all_chunks, type=dtype)
947
1302
 
948
1303
  return pc.cast(array, pa.string())
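For context on the chunking above, a small worked example of the slice-size arithmetic; MAX_INT_BYTES is assumed to carry the same 2147483646 value as the removed local MAX_BYTES constant:

    # Illustrative arithmetic only; mirrors the max_elems_per_chunk computation above.
    MAX_INT_BYTES = 2147483646  # assumed value of the module-level constant
    max_str_len = 20            # roughly the worst-case decimal width of an int64
    max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
    print(max_elems_per_chunk)  # 53687091 elements per slice before casting to string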
1304
+
1305
+
1306
+ def file_to_table(
1307
+ path: str,
1308
+ content_type: str,
1309
+ content_encoding: str = ContentEncoding.IDENTITY.value,
1310
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
1311
+ column_names: Optional[List[str]] = None,
1312
+ include_columns: Optional[List[str]] = None,
1313
+ pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
1314
+ partial_file_download_params: Optional[PartialFileDownloadParams] = None,
1315
+ fs_open_kwargs: Dict[str, Any] = {},
1316
+ **kwargs,
1317
+ ) -> pa.Table:
1318
+ """
1319
+ Read a file into a PyArrow Table using any filesystem.
1320
+
1321
+ Args:
1322
+ path: The file path to read
1323
+ content_type: The content type of the file (e.g., ContentType.CSV.value)
1324
+ content_encoding: The content encoding (default: IDENTITY)
1325
+ filesystem: The filesystem to use (if None, will be inferred from path)
1326
+ column_names: Optional column names to assign
1327
+ include_columns: Optional columns to include in the result
1328
+ pa_read_func_kwargs_provider: Optional kwargs provider for customization
+ partial_file_download_params: Optional partial download parameters (e.g., row groups for Parquet)
1329
+ fs_open_kwargs: Optional kwargs for filesystem open operations
1330
+ **kwargs: Additional kwargs passed to the reader function
1331
+
1332
+ Returns:
1333
+ pa.Table: The loaded PyArrow Table
1334
+ """
1335
+ logger.debug(
1336
+ f"Reading {path} to PyArrow. Content type: {content_type}. "
1337
+ f"Encoding: {content_encoding}"
1338
+ )
1339
+
1340
+ if (
1341
+ content_type == ContentType.PARQUET.value
1342
+ and content_encoding == ContentEncoding.IDENTITY.value
1343
+ and not filesystem
1344
+ and path.startswith("s3://")
1345
+ ):
1346
+ # Use optimized partial parquet reader for s3 if possible
1347
+ logger.debug(
1348
+ f"Reading {path} using parquet reader for encoding={content_encoding} "
1349
+ f"and content_type={content_type}"
1350
+ )
1351
+
1352
+ parquet_reader_func = None
1353
+ if kwargs.get(READER_TYPE_KWARG, "daft") == "daft":
1354
+ from deltacat.utils.daft import daft_file_to_pyarrow_table
1355
+
1356
+ parquet_reader_func = daft_file_to_pyarrow_table
1357
+ elif partial_file_download_params and isinstance(
1358
+ partial_file_download_params, PartialParquetParameters
1359
+ ):
1360
+ parquet_reader_func = partial_parquet_file_to_table
1361
+
1362
+ if parquet_reader_func is not None:
1363
+ return parquet_reader_func(
1364
+ path=path,
1365
+ content_type=content_type,
1366
+ content_encoding=content_encoding,
1367
+ filesystem=filesystem,
1368
+ column_names=column_names,
1369
+ include_columns=include_columns,
1370
+ pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
1371
+ partial_file_download_params=partial_file_download_params,
1372
+ **kwargs,
1373
+ )
1374
+
1375
+ if READER_TYPE_KWARG in kwargs:
1376
+ kwargs.pop(READER_TYPE_KWARG)
1377
+
1378
+ pa_read_func = CONTENT_TYPE_TO_READ_FN.get(content_type)
1379
+ if not pa_read_func:
1380
+ raise NotImplementedError(
1381
+ f"PyArrow reader for content type '{content_type}' not "
1382
+ f"implemented. Known content types: "
1383
+ f"{list(CONTENT_TYPE_TO_READ_FN.keys())}"
1384
+ )
1385
+
1386
+ reader_kwargs = content_type_to_reader_kwargs(content_type)
1387
+
1388
+ _add_column_kwargs(content_type, column_names, include_columns, reader_kwargs)
1389
+
1390
+ # Merge with provided kwargs
1391
+ reader_kwargs.update(kwargs)
1392
+
1393
+ if pa_read_func_kwargs_provider:
1394
+ reader_kwargs = pa_read_func_kwargs_provider(content_type, reader_kwargs)
1395
+
1396
+ logger.debug(f"Reading {path} via {pa_read_func} with kwargs: {reader_kwargs}")
1397
+
1398
+ table, latency = timed_invocation(
1399
+ pa_read_func,
1400
+ path,
1401
+ filesystem=filesystem,
1402
+ fs_open_kwargs=fs_open_kwargs,
1403
+ content_encoding=content_encoding,
1404
+ **reader_kwargs,
1405
+ )
1406
+ logger.debug(f"Time to read {path} into PyArrow Table: {latency}s")
1407
+ return table
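A brief usage sketch for file_to_table, assuming it is exposed from deltacat.utils.pyarrow and that ContentType/ContentEncoding come from deltacat.types.media (paths assumed); the CSV path and column names are placeholders:

    # from deltacat.utils.pyarrow import file_to_table               # assumed module path
    # from deltacat.types.media import ContentType, ContentEncoding  # assumed import path

    table = file_to_table(
        path="/tmp/example.csv",  # placeholder local path; an explicit filesystem is optional
        content_type=ContentType.CSV.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        column_names=["id", "value"],  # names assigned via _add_column_kwargs above
        include_columns=["id"],        # project a subset of the columns
    )
    print(table.num_rows, table.schema)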
1408
+
1409
+
1410
+ def file_to_parquet(
1411
+ path: str,
1412
+ content_type: str = ContentType.PARQUET.value,
1413
+ content_encoding: str = ContentEncoding.IDENTITY.value,
1414
+ filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
1415
+ column_names: Optional[List[str]] = None,
1416
+ include_columns: Optional[List[str]] = None,
1417
+ pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
1418
+ partial_file_download_params: Optional[PartialFileDownloadParams] = None,
1419
+ fs_open_kwargs: Dict[str, Any] = {},
1420
+ **kwargs,
1421
+ ) -> ParquetFile:
1422
+ """
1423
+ Read a file into a PyArrow ParquetFile using any filesystem.
1424
+
1425
+ Returns a ParquetFile object that provides metadata access and lazy loading.
1426
+
1427
+ Args:
1428
+ path: The file path to read
1429
+ content_type: The content type (must be PARQUET, default: PARQUET)
1430
+ content_encoding: The content encoding (must be IDENTITY, default: IDENTITY)
1431
+ filesystem: The filesystem to use (if None, will be inferred from path)
1432
+ column_names: Optional column names (unused for ParquetFile but kept for API consistency)
1433
+ include_columns: Optional columns (unused for ParquetFile but kept for API consistency)
1434
+ pa_read_func_kwargs_provider: Optional kwargs provider for customization
1435
+ fs_open_kwargs: Optional kwargs for filesystem open operations
1436
+ **kwargs: Additional kwargs passed to ParquetFile constructor
1437
+
1438
+ Returns:
1439
+ ParquetFile: The ParquetFile object for lazy loading and metadata access
1440
+
1441
+ Raises:
1442
+ ContentTypeValidationError: If content_type is not PARQUET or content_encoding is not IDENTITY
1443
+ """
1444
+ logger.debug(
1445
+ f"Reading {path} to PyArrow ParquetFile. "
1446
+ f"Content type: {content_type}. Encoding: {content_encoding}"
1447
+ )
1448
+ # Validate content type and encoding
1449
+ if (
1450
+ content_type != ContentType.PARQUET.value
1451
+ or content_encoding != ContentEncoding.IDENTITY.value
1452
+ ):
1453
+ raise ContentTypeValidationError(
1454
+ f"File with content type: {content_type} and content encoding: {content_encoding} "
1455
+ "cannot be read into pyarrow.parquet.ParquetFile"
1456
+ )
1457
+
1458
+ # Resolve filesystem and path
1459
+ if not filesystem or isinstance(filesystem, pafs.FileSystem):
1460
+ path, filesystem = resolve_path_and_filesystem(path, filesystem)
1461
+
1462
+ # Build kwargs for ParquetFile constructor
1463
+ parquet_kwargs = {}
1464
+
1465
+ # Add filesystem to kwargs if we have one
1466
+ if filesystem:
1467
+ parquet_kwargs["filesystem"] = filesystem
1468
+
1469
+ # Apply kwargs provider if provided
1470
+ if pa_read_func_kwargs_provider:
1471
+ parquet_kwargs = pa_read_func_kwargs_provider(content_type, parquet_kwargs)
1472
+
1473
+ # Merge with provided kwargs
1474
+ parquet_kwargs.update(kwargs)
1475
+
1476
+ logger.debug(f"Pre-sanitize kwargs for {path}: {parquet_kwargs}")
1477
+
1478
+ # Sanitize kwargs to only include those supported by ParquetFile.__init__
1479
+ parquet_kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, parquet_kwargs)
1480
+
1481
+ logger.debug(
1482
+ f"Reading the file from {path} into ParquetFile with kwargs: {parquet_kwargs}"
1483
+ )
1484
+
1485
+ def _create_parquet_file():
1486
+ return ParquetFile(path, **parquet_kwargs)
1487
+
1488
+ pq_file, latency = timed_invocation(_create_parquet_file)
1489
+
1490
+ logger.debug(f"Time to get {path} into parquet file: {latency}s")
1491
+
1492
+ return pq_file
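A short sketch of the lazy-metadata pattern that file_to_parquet enables (module path assumed as above; the parquet path is a placeholder):

    # from deltacat.utils.pyarrow import file_to_parquet  # assumed module path

    pq_file = file_to_parquet(path="/tmp/example.parquet")  # placeholder local path

    # ParquetFile exposes metadata without materializing the table...
    print(pq_file.metadata.num_rows, pq_file.metadata.num_row_groups)

    # ...and per-row-group reads when only part of the file is needed.
    first_row_group = pq_file.read_row_group(0)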
1493
+
1494
+
1495
+ def concat_tables(
1496
+ tables: List[Union[pa.Table, papq.ParquetFile]]
1497
+ ) -> Optional[Union[pa.Table, papq.ParquetFile, List[papq.ParquetFile]]]:
1498
+ """
1499
+ Concatenate a list of PyArrow Tables or ParquetFiles.
1500
+
1501
+ Args:
1502
+ tables: List of PyArrow Tables or ParquetFiles to concatenate
1503
+
1504
+ Returns:
1505
+ - Single table/ParquetFile if only one input
1506
+ - List of ParquetFiles if all inputs are ParquetFiles (preserves lazy behavior)
1507
+ - Concatenated PyArrow Table if mixed types or multiple PyArrow Tables
1508
+ - None if input is empty
1509
+ """
1510
+ if tables is None or not len(tables):
1511
+ return None
1512
+ if len(tables) == 1:
1513
+ # Return single table as-is to preserve lazy behavior
1514
+ return next(iter(tables))
1515
+
1516
+ # Check if all tables are ParquetFiles - return list to preserve lazy behavior
1517
+ if all(isinstance(table, papq.ParquetFile) for table in tables):
1518
+ return list(tables)
1519
+
1520
+ # Convert all tables to PyArrow Tables for concatenation
1521
+ converted_tables = []
1522
+ for table in tables:
1523
+ if isinstance(table, papq.ParquetFile):
1524
+ converted_tables.append(table.read())
1525
+ else:
1526
+ converted_tables.append(table)
1527
+
1528
+ return pa.concat_tables(converted_tables)
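A small sketch of the return shapes documented above, using plain in-memory tables (ParquetFile inputs would typically come from file_to_parquet):

    import pyarrow as pa
    # from deltacat.utils.pyarrow import concat_tables  # assumed module path

    t1 = pa.table({"x": [1, 2]})
    t2 = pa.table({"x": [3]})

    assert concat_tables([]) is None   # empty input
    assert concat_tables([t1]) is t1   # single input returned as-is
    combined = concat_tables([t1, t2])
    assert combined.num_rows == 3      # plain pa.Tables are concatenated
    # A list containing only papq.ParquetFile objects is returned as a list to
    # preserve lazy reads; mixing ParquetFiles with Tables materializes the
    # ParquetFiles via .read() before concatenation.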
1529
+
1530
+
1531
+ def delta_manifest_to_table(
1532
+ manifest: "Manifest",
1533
+ delta: Optional["Delta"] = None,
1534
+ ) -> pa.Table:
1535
+ """Create a flattened PyArrow table from a delta manifest.
1536
+
1537
+ This implementation can process ~1.4MM records/second on a
1538
+ 10-core 2025 MacBook Air M4 with 16GB of RAM.
1539
+
1540
+ Args:
1541
+ manifest: The manifest to convert to a table
1542
+ delta: Optional parent delta of the manifest
1543
+
1544
+ Returns:
1545
+ PyArrow table with flattened manifest entry data
1546
+ """
1547
+ if not manifest.entries:
1548
+ return pa.table({})
1549
+
1550
+ num_entries = len(manifest.entries)
1551
+
1552
+ # Get manifest-level data once
1553
+ manifest_author = manifest.author
1554
+ author_name = manifest_author.name if manifest_author else None
1555
+ author_version = manifest_author.version if manifest_author else None
1556
+
1557
+ # Get delta-level data once
1558
+ stream_position = delta.stream_position if delta else None
1559
+ previous_stream_position = delta.previous_stream_position if delta else None
1560
+
1561
+ # Pre-allocate lists for core columns to avoid repeated list operations
1562
+ url_values = [None] * num_entries
1563
+ id_values = [None] * num_entries
1564
+ mandatory_values = [None] * num_entries
1565
+
1566
+ # Meta columns - most common fields in manifest entries
1567
+ meta_record_count = [None] * num_entries
1568
+ meta_content_length = [None] * num_entries
1569
+ meta_source_content_length = [None] * num_entries
1570
+ meta_content_type = [None] * num_entries
1571
+ meta_content_encoding = [None] * num_entries
1572
+
1573
+ # Track any additional meta fields we haven't seen before
1574
+ additional_meta_fields = {}
1575
+ additional_entry_fields = {}
1576
+
1577
+ # Single pass through entries with direct list assignment
1578
+ for i, entry in enumerate(manifest.entries):
1579
+ # Handle core entry fields efficiently
1580
+ url_values[i] = entry.get("url") or entry.get("uri")
1581
+ id_values[i] = entry.get("id")
1582
+ mandatory_values[i] = entry.get("mandatory")
1583
+
1584
+ # Handle meta fields efficiently
1585
+ meta = entry.get("meta", {})
1586
+ meta_record_count[i] = meta.get("record_count")
1587
+ meta_content_length[i] = meta.get("content_length")
1588
+ meta_source_content_length[i] = meta.get("source_content_length")
1589
+ meta_content_type[i] = meta.get("content_type")
1590
+ meta_content_encoding[i] = meta.get("content_encoding")
1591
+
1592
+ # Handle any additional meta fields not in our core set
1593
+ for meta_key, meta_value in meta.items():
1594
+ if meta_key not in {
1595
+ "record_count",
1596
+ "content_length",
1597
+ "source_content_length",
1598
+ "content_type",
1599
+ "content_encoding",
1600
+ "entry_type",
1601
+ }:
1602
+ field_name = f"meta_{meta_key}"
1603
+ if field_name not in additional_meta_fields:
1604
+ additional_meta_fields[field_name] = [None] * num_entries
1605
+ additional_meta_fields[field_name][i] = meta_value
1606
+
1607
+ # Handle any additional entry fields not in our core set
1608
+ for entry_key, entry_value in entry.items():
1609
+ if entry_key not in {"url", "uri", "id", "mandatory", "meta"}:
1610
+ if entry_key not in additional_entry_fields:
1611
+ additional_entry_fields[entry_key] = [None] * num_entries
1612
+ additional_entry_fields[entry_key][i] = entry_value
1613
+
1614
+ # Build the arrays dict with core columns
1615
+ arrays_dict = {
1616
+ "id": pa.array(id_values),
1617
+ "mandatory": pa.array(mandatory_values),
1618
+ "meta_content_encoding": pa.array(meta_content_encoding),
1619
+ "meta_content_length": pa.array(meta_content_length),
1620
+ "meta_content_type": pa.array(meta_content_type),
1621
+ "meta_record_count": pa.array(meta_record_count),
1622
+ "meta_source_content_length": pa.array(meta_source_content_length),
1623
+ "path": pa.array(url_values),
1624
+ }
1625
+
1626
+ # Add additional fields if they exist
1627
+ for field_name, field_values in additional_meta_fields.items():
1628
+ arrays_dict[field_name] = pa.array(field_values)
1629
+
1630
+ for field_name, field_values in additional_entry_fields.items():
1631
+ arrays_dict[field_name] = pa.array(field_values)
1632
+
1633
+ # Add manifest/delta columns only if they have data (avoid null columns)
1634
+ if author_name is not None:
1635
+ arrays_dict["author_name"] = pa.array([author_name] * num_entries)
1636
+ if author_version is not None:
1637
+ arrays_dict["author_version"] = pa.array([author_version] * num_entries)
1638
+ if stream_position is not None:
1639
+ arrays_dict["stream_position"] = pa.array([stream_position] * num_entries)
1640
+ if previous_stream_position is not None:
1641
+ arrays_dict["previous_stream_position"] = pa.array(
1642
+ [previous_stream_position] * num_entries
1643
+ )
1644
+
1645
+ return pa.table(arrays_dict)
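To illustrate the flattened layout produced above without constructing a real Manifest or Delta (their builders are outside this diff), the core columns for a two-entry manifest would look roughly like the hand-built table below; the example values are hypothetical:

    import pyarrow as pa

    # Hand-built approximation of the per-entry columns emitted above.
    expected_shape = pa.table({
        "id": ["entry-0", "entry-1"],
        "mandatory": [True, True],
        "meta_content_encoding": ["identity", "identity"],
        "meta_content_length": [1024, 2048],
        "meta_content_type": ["application/parquet", "application/parquet"],
        "meta_record_count": [100, 200],
        "meta_source_content_length": [4096, 8192],
        "path": ["s3://bucket/a.parquet", "s3://bucket/b.parquet"],
    })
    # author_name/author_version and (previous_)stream_position columns are only
    # appended when the manifest author or the parent delta provides them.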
1646
+
1647
+
1648
+ def get_base_arrow_type_name(arrow_type: pa.DataType) -> str:
1649
+ """Get the base type name from a PyArrow DataType for compatibility lookup.
1650
+
1651
+ This function normalizes complex PyArrow types to their base type names for
1652
+ use in reader compatibility validation. Only specific complex types are
1653
+ normalized; all others return their string representation.
1654
+
1655
+ Args:
1656
+ arrow_type: The PyArrow DataType to normalize
1657
+
1658
+ Returns:
1659
+ str: The normalized type name for compatibility lookup
1660
+
1661
+ Examples:
1662
+ >>> get_base_arrow_type_name(pa.int32())
1663
+ 'int32'
1664
+ >>> get_base_arrow_type_name(pa.list_(pa.int32()))
1665
+ 'list'
1666
+ >>> get_base_arrow_type_name(pa.timestamp('s', tz='UTC'))
1667
+ 'timestamp_tz[s]'
1668
+ """
1669
+ # Only normalize specific complex types, otherwise return str(arrow_type)
1670
+ if isinstance(arrow_type, pa.FixedShapeTensorType):
1671
+ return "fixed_shape_tensor"
1672
+ elif pa.types.is_large_list(arrow_type):
1673
+ return "large_list"
1674
+ elif pa.types.is_list_view(arrow_type):
1675
+ return "list_view"
1676
+ elif pa.types.is_large_list_view(arrow_type):
1677
+ return "large_list_view"
1678
+ elif pa.types.is_fixed_size_list(arrow_type):
1679
+ return "fixed_size_list"
1680
+ elif pa.types.is_list(arrow_type):
1681
+ return "list"
1682
+ elif pa.types.is_map(arrow_type):
1683
+ return "map"
1684
+ elif pa.types.is_struct(arrow_type):
1685
+ return "struct"
1686
+ elif pa.types.is_dictionary(arrow_type):
1687
+ return "dictionary"
1688
+ elif pa.types.is_decimal(arrow_type):
1689
+ if isinstance(arrow_type, pa.Decimal128Type):
1690
+ return "decimal128"
1691
+ elif isinstance(arrow_type, pa.Decimal256Type):
1692
+ return "decimal256"
1693
+ elif pa.types.is_timestamp(arrow_type):
1694
+ # Check if it has timezone info
1695
+ if arrow_type.tz is not None:
1696
+ return f"timestamp_tz[{arrow_type.unit}]"
1697
+ else:
1698
+ return str(arrow_type)
1699
+ else:
1700
+ # For all other types, return the string representation
1701
+ return str(arrow_type)
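A few illustrative inputs and the names the branches above produce, written as plain assertions (get_base_arrow_type_name is assumed importable from this module, e.g. deltacat.utils.pyarrow):

    import pyarrow as pa
    # from deltacat.utils.pyarrow import get_base_arrow_type_name  # assumed module path

    assert get_base_arrow_type_name(pa.int64()) == "int64"
    assert get_base_arrow_type_name(pa.large_list(pa.int32())) == "large_list"
    assert get_base_arrow_type_name(pa.map_(pa.string(), pa.int32())) == "map"
    assert get_base_arrow_type_name(pa.decimal256(76, 0)) == "decimal256"
    assert get_base_arrow_type_name(pa.timestamp("ms", tz="UTC")) == "timestamp_tz[ms]"
    assert get_base_arrow_type_name(pa.timestamp("ms")) == "timestamp[ms]"

Note that the timezone-aware case includes the unit, matching the f-string in the timestamp branch above.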
1702
+
1703
+
1704
+ def get_supported_test_types() -> List[Tuple[str, str, List[Any]]]:
1705
+ """Get comprehensive PyArrow types supported by DeltaCAT writers and readers.
1706
+
1707
+ This utility function returns example test data for every Arrow type
1708
+ supported by DeltaCAT writers and readers of tables with schemas. The data
1709
+ is used for testing compatibility between different dataset types and
1710
+ content types.
1711
+
1712
+ Returns:
1713
+ List[Tuple[str, str, List[Any]]]: List of tuples containing:
1714
+ - Test name (str): Human-readable name for the test case
1715
+ - Arrow type code (str): Python code to create the PyArrow DataType
1716
+ - Test data (List[Any]): Sample data values for testing
1717
+
1718
+ Examples:
1719
+ >>> test_types = get_supported_test_types()
1720
+ >>> for name, type_code, data in test_types[:2]:
1721
+ ... print(f"{name}: {type_code} -> {data}")
1722
+ int8: pa.int8() -> [127, -128, 0]
1723
+ int16: pa.int16() -> [32767, -32768, 1000]
1724
+ """
1725
+
1726
+ return [
1727
+ # Integer types
1728
+ ("int8", "pa.int8()", [127, -128, 0]),
1729
+ ("int16", "pa.int16()", [32767, -32768, 1000]),
1730
+ ("int32", "pa.int32()", [2147483647, -2147483648, 1000]),
1731
+ ("int64", "pa.int64()", [9223372036854775807, -9223372036854775808, 1000]),
1732
+ ("uint8", "pa.uint8()", [255, 0, 128]),
1733
+ ("uint16", "pa.uint16()", [65535, 0, 1000]),
1734
+ ("uint32", "pa.uint32()", [4294967295, 0, 1000]),
1735
+ ("uint64", "pa.uint64()", [18446744073709551615, 0, 1000]),
1736
+ # Float types
1737
+ ("float16", "pa.float16()", np.array([1.5, np.nan], dtype=np.float16)),
1738
+ ("float32", "pa.float32()", [3.14159, -2.71828, 1.41421]),
1739
+ ("float64", "pa.float64()", [1.123456789, -2.987654321, 3.141592653589793]),
1740
+ # Boolean and null
1741
+ ("bool_", "pa.bool_()", [True, False, True]),
1742
+ ("null", "pa.null()", [None, None, None]),
1743
+ # String types
1744
+ ("string", "pa.string()", ["hello", "world", "test"]),
1745
+ (
1746
+ "large_string",
1747
+ "pa.large_string()",
1748
+ ["large hello", "large world", "large test"],
1749
+ ),
1750
+ # Binary types
1751
+ ("binary", "pa.binary()", [b"hello", b"world", b"test"]),
1752
+ (
1753
+ "large_binary",
1754
+ "pa.large_binary()",
1755
+ [b"large hello", b"large world", b"large test"],
1756
+ ),
1757
+ # Date and time types
1758
+ (
1759
+ "date32",
1760
+ "pa.date32()",
1761
+ [date(2023, 1, 1), date(2023, 12, 31), date(2024, 6, 15)],
1762
+ ),
1763
+ (
1764
+ "date64",
1765
+ "pa.date64()",
1766
+ [date(2023, 1, 1), date(2023, 12, 31), date(2024, 6, 15)],
1767
+ ),
1768
+ ("time32_s", "pa.time32('s')", [1754962113, 1754962114, 1754962115]),
1769
+ ("time32_ms", "pa.time32('ms')", [1754962113, 1754962114, 1754962115]),
1770
+ (
1771
+ "time64_us",
1772
+ "pa.time64('us')",
1773
+ [1754962113000000, 1754962114000000, 1754962115000000],
1774
+ ),
1775
+ (
1776
+ "time64_ns",
1777
+ "pa.time64('ns')",
1778
+ [1754962113000000000, 1754962114000000000, 1754962115000000000],
1779
+ ),
1780
+ (
1781
+ "timestamp_s",
1782
+ "pa.timestamp('s')",
1783
+ [
1784
+ datetime(2023, 1, 1, 12, 0, 0),
1785
+ datetime(2023, 12, 31, 23, 59, 59),
1786
+ datetime(2024, 6, 15, 10, 30, 45),
1787
+ ],
1788
+ ),
1789
+ (
1790
+ "timestamp_ms",
1791
+ "pa.timestamp('ms')",
1792
+ [
1793
+ datetime(2023, 1, 1, 12, 0, 0),
1794
+ datetime(2023, 12, 31, 23, 59, 59),
1795
+ datetime(2024, 6, 15, 10, 30, 45),
1796
+ ],
1797
+ ),
1798
+ (
1799
+ "timestamp_us",
1800
+ "pa.timestamp('us')",
1801
+ [
1802
+ datetime(2023, 1, 1, 12, 0, 0),
1803
+ datetime(2023, 12, 31, 23, 59, 59),
1804
+ datetime(2024, 6, 15, 10, 30, 45),
1805
+ ],
1806
+ ),
1807
+ (
1808
+ "timestamp_ns",
1809
+ "pa.timestamp('ns')",
1810
+ [
1811
+ datetime(2023, 1, 1, 12, 0, 0),
1812
+ datetime(2023, 12, 31, 23, 59, 59),
1813
+ datetime(2024, 6, 15, 10, 30, 45),
1814
+ ],
1815
+ ),
1816
+ (
1817
+ "timestamp_s_utc",
1818
+ "pa.timestamp('s', tz='UTC')",
1819
+ [
1820
+ datetime(2023, 1, 1, 12, 0, 0),
1821
+ datetime(2023, 12, 31, 23, 59, 59),
1822
+ datetime(2024, 6, 15, 10, 30, 45),
1823
+ ],
1824
+ ),
1825
+ (
1826
+ "timestamp_ms_utc",
1827
+ "pa.timestamp('ms', tz='UTC')",
1828
+ [
1829
+ datetime(2023, 1, 1, 12, 0, 0),
1830
+ datetime(2023, 12, 31, 23, 59, 59),
1831
+ datetime(2024, 6, 15, 10, 30, 45),
1832
+ ],
1833
+ ),
1834
+ (
1835
+ "timestamp_us_utc",
1836
+ "pa.timestamp('us', tz='UTC')",
1837
+ [
1838
+ datetime(2023, 1, 1, 12, 0, 0),
1839
+ datetime(2023, 12, 31, 23, 59, 59),
1840
+ datetime(2024, 6, 15, 10, 30, 45),
1841
+ ],
1842
+ ),
1843
+ (
1844
+ "timestamp_ns_utc",
1845
+ "pa.timestamp('ns', tz='UTC')",
1846
+ [
1847
+ datetime(2023, 1, 1, 12, 0, 0),
1848
+ datetime(2023, 12, 31, 23, 59, 59),
1849
+ datetime(2024, 6, 15, 10, 30, 45),
1850
+ ],
1851
+ ),
1852
+ ("duration_s", "pa.duration('s')", [1754962113, 1754962114, 1754962115]),
1853
+ (
1854
+ "duration_ms",
1855
+ "pa.duration('ms')",
1856
+ [1754962113000, 1754962114000, 1754962115000],
1857
+ ),
1858
+ (
1859
+ "duration_us",
1860
+ "pa.duration('us')",
1861
+ [1754962113000000, 1754962114000000, 1754962115000000],
1862
+ ),
1863
+ (
1864
+ "duration_ns",
1865
+ "pa.duration('ns')",
1866
+ [1754962113000000000, 1754962114000000000, 1754962115000000000],
1867
+ ),
1868
+ (
1869
+ "month_day_nano",
1870
+ "pa.month_day_nano_interval()",
1871
+ [
1872
+ pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()),
1873
+ pa.scalar((2, 15, -30), type=pa.month_day_nano_interval()),
1874
+ pa.scalar((3, 15, -30), type=pa.month_day_nano_interval()),
1875
+ ],
1876
+ ),
1877
+ # Decimal
1878
+ (
1879
+ "decimal128_5_2",
1880
+ "pa.decimal128(5, 2)",
1881
+ [Decimal("123.45"), Decimal("-67.89"), Decimal("999.99")],
1882
+ ),
1883
+ (
1884
+ "decimal128_38_0",
1885
+ "pa.decimal128(38, 0)",
1886
+ [
1887
+ Decimal("12345678901234567890123456789012345678"),
1888
+ Decimal("-12345678901234567890123456789012345678"),
1889
+ Decimal("0"),
1890
+ ],
1891
+ ),
1892
+ (
1893
+ "decimal128_1_0",
1894
+ "pa.decimal128(1, 0)",
1895
+ [Decimal("1"), Decimal("2"), Decimal("3")],
1896
+ ),
1897
+ (
1898
+ "decimal128_38_10",
1899
+ "pa.decimal128(38, 10)",
1900
+ [
1901
+ Decimal("1234567890123456789012345678.9012345678"),
1902
+ Decimal("-1234567890123456789012345678.9012345678"),
1903
+ Decimal("0.0000000000"),
1904
+ ],
1905
+ ),
1906
+ (
1907
+ "decimal256_76_0",
1908
+ "pa.decimal256(76, 0)",
1909
+ [
1910
+ Decimal(
1911
+ "1234567890123456789012345678901234567812345678901234567890123456789012345678"
1912
+ ),
1913
+ Decimal("-0"),
1914
+ Decimal("0"),
1915
+ ],
1916
+ ),
1917
+ (
1918
+ "decimal256_1_0",
1919
+ "pa.decimal256(1, 0)",
1920
+ [Decimal("1"), Decimal("2"), Decimal("3")],
1921
+ ),
1922
+ (
1923
+ "decimal256_5_2",
1924
+ "pa.decimal256(5, 2)",
1925
+ [Decimal("123.45"), Decimal("-67.89"), Decimal("999.99")],
1926
+ ),
1927
+ (
1928
+ "decimal256_76_38",
1929
+ "pa.decimal256(76, 38)",
1930
+ [
1931
+ Decimal(
1932
+ "12345678901234567890123456789012345678.12345678901234567890123456789012345678"
1933
+ ),
1934
+ Decimal("-0.00000000000000000000000000000000000000"),
1935
+ Decimal("0.00000000000000000000000000000000000000"),
1936
+ ],
1937
+ ),
1938
+ # List types
1939
+ ("list_int32", "pa.list_(pa.int32())", [[1, 2, 3], [4, 5], [6, 7, 8, 9]]),
1940
+ ("list_string", "pa.list_(pa.string())", [["a", "b"], ["c", "d", "e"], ["f"]]),
1941
+ # Struct type
1942
+ (
1943
+ "struct_simple",
1944
+ "pa.struct([('name', pa.string()), ('age', pa.int32())])",
1945
+ [
1946
+ {"name": "Alice", "age": 30},
1947
+ {"name": "Bob", "age": 25},
1948
+ {"name": "Charlie", "age": 35},
1949
+ ],
1950
+ ),
1951
+ (
1952
+ "large_list_int32",
1953
+ "pa.large_list(pa.int32())",
1954
+ [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
1955
+ ),
1956
+ (
1957
+ "fixed_size_list_int32",
1958
+ "pa.list_(pa.int32(), 3)",
1959
+ [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
1960
+ ),
1961
+ (
1962
+ "list_view_int32",
1963
+ "pa.list_view(pa.int32())",
1964
+ [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
1965
+ ),
1966
+ (
1967
+ "large_list_view_int32",
1968
+ "pa.large_list_view(pa.int32())",
1969
+ [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
1970
+ ),
1971
+ # Dictionary type
1972
+ (
1973
+ "dictionary_string",
1974
+ "pa.dictionary(pa.int32(), pa.string())",
1975
+ ["apple", "banana", "apple"],
1976
+ ),
1977
+ # Map type
1978
+ (
1979
+ "map_string_int32",
1980
+ "pa.map_(pa.string(), pa.int32())",
1981
+ [{"a": 1, "b": 2}, {"c": 3, "d": 4}, {"e": 5}],
1982
+ ),
1983
+ # Extension Types
1984
+ (
1985
+ "fixed_shape_tensor",
1986
+ "pa.fixed_shape_tensor(pa.int32(), [3, 3])",
1987
+ [
1988
+ np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int32),
1989
+ np.array([10, 11, 12, 13, 14, 15, 16, 17, 18], dtype=np.int32),
1990
+ np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int32),
1991
+ ],
1992
+ ),
1993
+ ]
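A sketch of how a test might materialize one of these tuples as an Arrow array, assuming get_supported_test_types is importable from this module; eval of the type-code string is for illustration only:

    import pyarrow as pa
    # from deltacat.utils.pyarrow import get_supported_test_types  # assumed module path

    name, type_code, data = get_supported_test_types()[0]  # ("int8", "pa.int8()", [127, -128, 0])
    arrow_type = eval(type_code)  # illustration only; evaluates "pa.int8()" -> pa.int8()
    array = pa.array(data, type=arrow_type)
    print(name, array.type, array.to_pylist())  # int8 int8 [127, -128, 0]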