deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
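The file-level diff reproduced below appears to correspond to deltacat/tests/utils/test_pyarrow.py (entry 163 above). It tracks the move from the S3-specific pyarrow helpers (s3_file_to_table, s3_partial_parquet_file_to_table) to filesystem-agnostic ones (file_to_table, file_to_parquet, partial_parquet_file_to_table, table_to_file). As a rough orientation only, the sketch below mirrors the call pattern exercised by the new tests; it is not code taken from the package, and the local path, filesystem, and column names are hypothetical.

import fsspec
import pyarrow as pa
from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import file_to_table, table_to_file

# Hypothetical local round-trip modeled on the new tests in the diff below.
fs = fsspec.filesystem("file")
table = pa.table({"col1": ["a", "b"], "col2": [1, 2]})

# Write a local CSV through the filesystem-agnostic writer.
table_to_file(
    table,
    "/tmp/example.csv.gz",  # hypothetical output path
    fs,
    lambda _: "/tmp/example.csv.gz",
    content_type=ContentType.CSV.value,
)

# Read it back; the tests read CSV output back as GZIP-encoded content.
result = file_to_table(
    "/tmp/example.csv.gz",
    ContentType.CSV.value,
    ContentEncoding.GZIP.value,
    filesystem=fs,
    column_names=["col1", "col2"],
)
assert len(result) == 2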
@@ -1,27 +1,50 @@
1
1
  from unittest import TestCase
2
2
  from deltacat.utils.pyarrow import (
3
- s3_partial_parquet_file_to_table,
3
+ partial_parquet_file_to_table,
4
4
  pyarrow_read_csv,
5
+ ContentTypeValidationError,
5
6
  content_type_to_reader_kwargs,
6
7
  _add_column_kwargs,
7
- s3_file_to_table,
8
+ file_to_table,
9
+ file_to_parquet,
10
+ table_to_file,
8
11
  ReadKwargsProviderPyArrowSchemaOverride,
12
+ ReadKwargsProviderPyArrowCsvPureUtf8,
13
+ RAISE_ON_DECIMAL_OVERFLOW,
9
14
  RAISE_ON_EMPTY_CSV_KWARG,
10
15
  )
16
+ import decimal
11
17
  from deltacat.types.media import ContentEncoding, ContentType
12
18
  from deltacat.types.partial_download import PartialParquetParameters
13
19
  from pyarrow.parquet import ParquetFile
20
+ import tempfile
14
21
  import pyarrow as pa
22
+ from pyarrow import csv as pacsv
23
+ import fsspec
24
+ import gzip
25
+ import json
26
+ from pyarrow import (
27
+ feather as paf,
28
+ parquet as papq,
29
+ orc as paorc,
30
+ )
15
31
 
16
32
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
33
+ PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
17
34
  EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
18
35
  NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
36
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
37
+ "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
38
+ )
39
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
40
+ "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
41
+ )
19
42
  GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
20
43
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
21
44
 
22
45
 
23
- class TestS3PartialParquetFileToTable(TestCase):
24
- def test_s3_partial_parquet_file_to_table_sanity(self):
46
+ class TestPartialParquetFileToTable(TestCase):
47
+ def test_partial_parquet_file_to_table_sanity(self):
25
48
 
26
49
  pq_file = ParquetFile(PARQUET_FILE_PATH)
27
50
  partial_parquet_params = PartialParquetParameters.of(
@@ -35,7 +58,7 @@ class TestS3PartialParquetFileToTable(TestCase):
35
58
  # only first row group to be downloaded
36
59
  partial_parquet_params.row_groups_to_download.pop()
37
60
 
38
- result = s3_partial_parquet_file_to_table(
61
+ result = partial_parquet_file_to_table(
39
62
  PARQUET_FILE_PATH,
40
63
  include_columns=["n_legs"],
41
64
  content_encoding=ContentEncoding.IDENTITY.value,
@@ -46,7 +69,7 @@ class TestS3PartialParquetFileToTable(TestCase):
46
69
  self.assertEqual(len(result), 3)
47
70
  self.assertEqual(len(result.columns), 1)
48
71
 
49
- def test_s3_partial_parquet_file_to_table_when_schema_passed(self):
72
+ def test_partial_parquet_file_to_table_when_schema_passed(self):
50
73
 
51
74
  pq_file = ParquetFile(PARQUET_FILE_PATH)
52
75
  partial_parquet_params = PartialParquetParameters.of(
@@ -66,7 +89,7 @@ class TestS3PartialParquetFileToTable(TestCase):
66
89
 
67
90
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
68
91
 
69
- result = s3_partial_parquet_file_to_table(
92
+ result = partial_parquet_file_to_table(
70
93
  PARQUET_FILE_PATH,
71
94
  ContentType.PARQUET.value,
72
95
  ContentEncoding.IDENTITY.value,
@@ -85,7 +108,7 @@ class TestS3PartialParquetFileToTable(TestCase):
85
108
  self.assertEqual(result_schema.field(2).type, "int64")
86
109
  self.assertEqual(result_schema.field(2).name, "MISSING")
87
110
 
88
- def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
111
+ def test_partial_parquet_file_to_table_when_schema_missing_columns(self):
89
112
 
90
113
  pq_file = ParquetFile(PARQUET_FILE_PATH)
91
114
  partial_parquet_params = PartialParquetParameters.of(
@@ -105,7 +128,7 @@ class TestS3PartialParquetFileToTable(TestCase):
105
128
 
106
129
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
107
130
 
108
- result = s3_partial_parquet_file_to_table(
131
+ result = partial_parquet_file_to_table(
109
132
  PARQUET_FILE_PATH,
110
133
  ContentType.PARQUET.value,
111
134
  ContentEncoding.IDENTITY.value,
@@ -122,7 +145,7 @@ class TestS3PartialParquetFileToTable(TestCase):
122
145
  self.assertEqual(result_schema.field(0).type, "int64")
123
146
  self.assertEqual(result_schema.field(0).name, "MISSING")
124
147
 
125
- def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
148
+ def test_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
126
149
  self,
127
150
  ):
128
151
 
@@ -139,11 +162,11 @@ class TestS3PartialParquetFileToTable(TestCase):
139
162
 
140
163
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
141
164
 
142
- result = s3_partial_parquet_file_to_table(
165
+ result = partial_parquet_file_to_table(
143
166
  PARQUET_FILE_PATH,
144
167
  ContentType.PARQUET.value,
145
168
  ContentEncoding.IDENTITY.value,
146
- ["n_legs", "animal"],
169
+ column_names=["n_legs", "animal"],
147
170
  pa_read_func_kwargs_provider=pa_kwargs_provider,
148
171
  partial_file_download_params=partial_parquet_params,
149
172
  )
@@ -155,7 +178,7 @@ class TestS3PartialParquetFileToTable(TestCase):
155
178
  self.assertEqual(result_schema.field(0).type, "string")
156
179
  self.assertEqual(result_schema.field(0).name, "n_legs") # order doesn't change
157
180
 
158
- def test_s3_partial_parquet_file_to_table_when_multiple_row_groups(self):
181
+ def test_partial_parquet_file_to_table_when_multiple_row_groups(self):
159
182
 
160
183
  pq_file = ParquetFile(PARQUET_FILE_PATH)
161
184
  partial_parquet_params = PartialParquetParameters.of(
@@ -166,7 +189,7 @@ class TestS3PartialParquetFileToTable(TestCase):
166
189
  partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
167
190
  )
168
191
 
169
- result = s3_partial_parquet_file_to_table(
192
+ result = partial_parquet_file_to_table(
170
193
  PARQUET_FILE_PATH,
171
194
  content_encoding=ContentEncoding.IDENTITY.value,
172
195
  content_type=ContentType.PARQUET.value,
@@ -407,130 +430,1388 @@ class TestReadCSV(TestCase):
407
430
  ),
408
431
  )
409
432
 
433
+ def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
434
+ schema = pa.schema(
435
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
436
+ )
437
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
438
+ _add_column_kwargs(
439
+ ContentType.UNESCAPED_TSV.value,
440
+ ["is_active", "decimal_value"],
441
+ ["is_active", "decimal_value"],
442
+ kwargs,
443
+ )
444
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
410
445
 
411
- class TestS3FileToTable(TestCase):
412
- def test_s3_file_to_table_identity_sanity(self):
446
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
447
+ self.assertRaises(
448
+ pa.lib.ArrowInvalid,
449
+ lambda: pyarrow_read_csv(
450
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
451
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
452
+ ),
453
+ )
413
454
 
455
+ def test_read_csv_when_decimal_precision_overflows_sanity(self):
414
456
  schema = pa.schema(
415
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
457
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
458
+ )
459
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
460
+ _add_column_kwargs(
461
+ ContentType.UNESCAPED_TSV.value,
462
+ ["is_active", "decimal_value"],
463
+ ["is_active", "decimal_value"],
464
+ kwargs,
416
465
  )
417
466
 
418
- result = s3_file_to_table(
419
- NON_EMPTY_VALID_UTSV_PATH,
467
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
468
+
469
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
470
+
471
+ self.assertRaises(
472
+ pa.lib.ArrowInvalid,
473
+ lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
474
+ )
475
+
476
+ def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
477
+ schema = pa.schema(
478
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
479
+ )
480
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
481
+ _add_column_kwargs(
420
482
  ContentType.UNESCAPED_TSV.value,
421
- ContentEncoding.IDENTITY.value,
422
- ["is_active", "ship_datetime_utc"],
423
- None,
424
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
425
- schema=schema
483
+ ["is_active", "decimal_value"],
484
+ ["is_active", "decimal_value"],
485
+ kwargs,
486
+ )
487
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
488
+
489
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
490
+
491
+ self.assertRaises(
492
+ pa.lib.ArrowInvalid,
493
+ lambda: pyarrow_read_csv(
494
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
495
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
426
496
  ),
427
497
  )
428
498
 
499
+ def test_read_csv_when_decimal_scale_overflows_sanity(self):
500
+ schema = pa.schema(
501
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
502
+ )
503
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
504
+ _add_column_kwargs(
505
+ ContentType.UNESCAPED_TSV.value,
506
+ ["is_active", "decimal_value"],
507
+ ["is_active", "decimal_value"],
508
+ kwargs,
509
+ )
510
+
511
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
512
+
513
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
514
+
515
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
516
+
429
517
  self.assertEqual(len(result), 3)
518
+ self.assertEqual(
519
+ result[1][0].as_py(), decimal.Decimal("322236.66")
520
+ ) # rounding decimal
521
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
430
522
  self.assertEqual(len(result.column_names), 2)
431
523
  result_schema = result.schema
432
- for index, field in enumerate(result_schema):
433
- self.assertEqual(field.name, schema.field(index).name)
524
+ self.assertEqual(result_schema.field(0).type, "string")
525
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
434
526
 
435
- self.assertEqual(result.schema.field(0).type, "string")
527
+ def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
528
+ schema = pa.schema(
529
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
530
+ )
531
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
532
+ _add_column_kwargs(
533
+ ContentType.UNESCAPED_TSV.value,
534
+ ["is_active", "decimal_value"],
535
+ ["is_active", "decimal_value"],
536
+ kwargs,
537
+ )
436
538
 
437
- def test_s3_file_to_table_gzip_compressed_sanity(self):
539
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
540
+
541
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
542
+
543
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
544
+
545
+ self.assertEqual(len(result), 3)
546
+ self.assertEqual(
547
+ result[1][0].as_py(),
548
+ decimal.Decimal("322200"), # consequence of negative scale
549
+ ) # rounding decimal
550
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
551
+ self.assertEqual(len(result.column_names), 2)
552
+ result_schema = result.schema
553
+ self.assertEqual(result_schema.field(0).type, "string")
554
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
438
555
 
556
+ def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
439
557
  schema = pa.schema(
440
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
558
+ [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
559
+ )
560
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
561
+ _add_column_kwargs(
562
+ ContentType.UNESCAPED_TSV.value,
563
+ ["is_active", "decimal_value"],
564
+ ["is_active", "decimal_value"],
565
+ kwargs,
441
566
  )
442
567
 
443
- result = s3_file_to_table(
444
- GZIP_COMPRESSED_FILE_UTSV_PATH,
568
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
569
+
570
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
571
+
572
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
573
+
574
+ self.assertEqual(len(result), 3)
575
+ self.assertEqual(
576
+ result[1][0].as_py(), decimal.Decimal("322236.66")
577
+ ) # rounding decimal
578
+ self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
579
+ self.assertEqual(len(result.column_names), 2)
580
+ result_schema = result.schema
581
+ self.assertEqual(result_schema.field(0).type, "string")
582
+ self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
583
+
584
+ def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
585
+ self,
586
+ ):
587
+ schema = pa.schema(
588
+ [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
589
+ )
590
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
591
+ _add_column_kwargs(
445
592
  ContentType.UNESCAPED_TSV.value,
446
- ContentEncoding.GZIP.value,
447
- ["is_active", "ship_datetime_utc"],
448
- None,
449
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
450
- schema=schema
593
+ ["is_active", "decimal_value"],
594
+ ["is_active", "decimal_value"],
595
+ kwargs,
596
+ )
597
+
598
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
599
+
600
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
601
+
602
+ self.assertRaises(
603
+ pa.lib.ArrowNotImplementedError,
604
+ lambda: pyarrow_read_csv(
605
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
606
+ **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
451
607
  ),
452
608
  )
453
609
 
454
- self.assertEqual(len(result), 3)
610
+ def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
611
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
612
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
613
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
614
+
615
+ result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
616
+
617
+ # The default behavior of pyarrow is to invalid skip rows
618
+ self.assertEqual(len(result), 2)
619
+ self.assertEqual(result[1][0].as_py(), 32.33) # rounding decimal
620
+ self.assertEqual(result[1][1].as_py(), 0.4) # not rounded
455
621
  self.assertEqual(len(result.column_names), 2)
456
622
  result_schema = result.schema
457
- for index, field in enumerate(result_schema):
458
- self.assertEqual(field.name, schema.field(index).name)
623
+ self.assertEqual(result_schema.field(0).type, "string")
624
+ self.assertEqual(result_schema.field(1).type, pa.float64())
459
625
 
460
- self.assertEqual(result.schema.field(0).type, "string")
626
+ def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
627
+ self,
628
+ ):
629
+ schema = pa.schema(
630
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
631
+ )
632
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
633
+ _add_column_kwargs(
634
+ ContentType.UNESCAPED_TSV.value,
635
+ ["is_active", "decimal_value"],
636
+ ["is_active", "decimal_value"],
637
+ kwargs,
638
+ )
639
+
640
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
641
+
642
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
461
643
 
462
- def test_s3_file_to_table_bz2_compressed_sanity(self):
644
+ self.assertRaises(
645
+ pa.lib.ArrowInvalid,
646
+ lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
647
+ )
463
648
 
649
+ def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
464
650
  schema = pa.schema(
465
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
651
+ [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
652
+ )
653
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
654
+ _add_column_kwargs(
655
+ ContentType.UNESCAPED_TSV.value,
656
+ ["is_active", "decimal_value"],
657
+ ["is_active", "decimal_value"],
658
+ kwargs,
659
+ )
660
+
661
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
662
+
663
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
664
+
665
+ with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
666
+ result = pyarrow_read_csv(file, **kwargs)
667
+
668
+ self.assertEqual(len(result), 3)
669
+ self.assertEqual(
670
+ result[1][0].as_py(), decimal.Decimal("322236.66")
671
+ ) # rounding decimal
672
+ self.assertEqual(
673
+ result[1][1].as_py(), decimal.Decimal("32.33")
674
+ ) # not rounded
675
+ self.assertEqual(len(result.column_names), 2)
676
+ result_schema = result.schema
677
+ self.assertEqual(result_schema.field(0).type, "string")
678
+ self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
679
+
680
+
681
+ class TestWriters(TestCase):
682
+ def setUp(self):
683
+ self.table = pa.table({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
684
+ self.fs = fsspec.filesystem("file")
685
+ self.base_path = tempfile.mkdtemp()
686
+ self.fs.makedirs(self.base_path, exist_ok=True)
687
+
688
+ def tearDown(self):
689
+ self.fs.rm(self.base_path, recursive=True)
690
+
691
+ def test_write_feather(self):
692
+ path = f"{self.base_path}/test.feather"
693
+
694
+ table_to_file(
695
+ self.table,
696
+ path,
697
+ self.fs,
698
+ lambda x: path,
699
+ content_type=ContentType.FEATHER.value,
700
+ )
701
+ assert self.fs.exists(path), "file was not written"
702
+
703
+ # Verify content
704
+ result = paf.read_table(path)
705
+ assert result.equals(self.table)
706
+
707
+ def test_write_csv(self):
708
+ path = f"{self.base_path}/test.csv.gz"
709
+
710
+ table_to_file(
711
+ self.table,
712
+ path,
713
+ self.fs,
714
+ lambda x: path,
715
+ content_type=ContentType.CSV.value,
716
+ )
717
+ assert self.fs.exists(path), "file was not written"
718
+
719
+ # Verify content (should be GZIP compressed)
720
+ with self.fs.open(path, "rb") as f:
721
+ with gzip.GzipFile(fileobj=f) as gz:
722
+ content = gz.read().decode("utf-8")
723
+ # Should be quoted due to commas in data
724
+ assert '"a,b\tc|d",1' in content
725
+ assert '"e,f\tg|h",2' in content
726
+
727
+ def test_write_tsv(self):
728
+ path = f"{self.base_path}/test.tsv.gz"
729
+
730
+ table_to_file(
731
+ self.table,
732
+ path,
733
+ self.fs,
734
+ lambda x: path,
735
+ content_type=ContentType.TSV.value,
736
+ )
737
+ assert self.fs.exists(path), "file was not written"
738
+
739
+ # Verify content (should be GZIP compressed)
740
+ with self.fs.open(path, "rb") as f:
741
+ with gzip.GzipFile(fileobj=f) as gz:
742
+ content = gz.read().decode("utf-8")
743
+ # Should be quoted due to tabs in data
744
+ assert '"a,b\tc|d"\t1' in content
745
+ assert '"e,f\tg|h"\t2' in content
746
+
747
+ def test_write_psv(self):
748
+ path = f"{self.base_path}/test.psv.gz"
749
+
750
+ table_to_file(
751
+ self.table,
752
+ path,
753
+ self.fs,
754
+ lambda x: path,
755
+ content_type=ContentType.PSV.value,
756
+ )
757
+ assert self.fs.exists(path), "file was not written"
758
+
759
+ # Verify content (should be GZIP compressed)
760
+ with self.fs.open(path, "rb") as f:
761
+ with gzip.GzipFile(fileobj=f) as gz:
762
+ content = gz.read().decode("utf-8")
763
+ # Should be quoted due to pipes in data
764
+ assert '"a,b\tc|d"|1' in content
765
+ assert '"e,f\tg|h"|2' in content
766
+
767
+ def test_write_unescaped_tsv(self):
768
+ # Create table without delimiters for unescaped TSV
769
+ table = pa.table({"col1": ["abc", "def"], "col2": [1, 2]})
770
+ path = f"{self.base_path}/test.tsv.gz"
771
+
772
+ table_to_file(
773
+ table,
774
+ path,
775
+ self.fs,
776
+ lambda x: path,
777
+ content_type=ContentType.UNESCAPED_TSV.value,
778
+ )
779
+ assert self.fs.exists(path), "file was not written"
780
+
781
+ # Verify content (should be GZIP compressed)
782
+ with self.fs.open(path, "rb") as f:
783
+ with gzip.GzipFile(fileobj=f) as gz:
784
+ content = gz.read().decode("utf-8")
785
+ # With quoting_style="none", strings should not be quoted
786
+ assert "abc\t1" in content
787
+ assert "def\t2" in content
788
+
789
+ def test_write_orc(self):
790
+ path = f"{self.base_path}/test.orc"
791
+
792
+ table_to_file(
793
+ self.table,
794
+ path,
795
+ self.fs,
796
+ lambda x: path,
797
+ content_type=ContentType.ORC.value,
798
+ )
799
+ assert self.fs.exists(path), "file was not written"
800
+
801
+ # Verify content
802
+ result = paorc.read_table(path)
803
+ assert result.equals(self.table)
804
+
805
+ def test_write_parquet(self):
806
+ path = f"{self.base_path}/test.parquet"
807
+
808
+ table_to_file(
809
+ self.table,
810
+ path,
811
+ self.fs,
812
+ lambda x: path,
813
+ content_type=ContentType.PARQUET.value,
814
+ )
815
+ assert self.fs.exists(path), "file was not written"
816
+
817
+ # Verify content
818
+ result = papq.read_table(path)
819
+ assert result.equals(self.table)
820
+
821
+ def test_write_json(self):
822
+ path = f"{self.base_path}/test.json.gz"
823
+
824
+ table_to_file(
825
+ self.table,
826
+ path,
827
+ self.fs,
828
+ lambda x: path,
829
+ content_type=ContentType.JSON.value,
830
+ )
831
+ assert self.fs.exists(path), "file was not written"
832
+
833
+ # Verify content (should be GZIP compressed)
834
+ with self.fs.open(path, "rb") as f:
835
+ with gzip.GzipFile(fileobj=f) as gz:
836
+ content = gz.read().decode("utf-8")
837
+ # Each line should be a valid JSON object
838
+ lines = [
839
+ line for line in content.split("\n") if line
840
+ ] # Skip empty lines
841
+ assert len(lines) == 2 # 2 records
842
+ assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
843
+ assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}
844
+
845
+ def test_write_avro(self):
846
+ import polars as pl
847
+
848
+ path = f"{self.base_path}/test.avro"
849
+
850
+ table_to_file(
851
+ self.table,
852
+ path,
853
+ self.fs,
854
+ lambda x: path,
855
+ content_type=ContentType.AVRO.value,
856
+ )
857
+ assert self.fs.exists(path), "file was not written"
858
+
859
+ # Verify content by reading with polars
860
+ result = pl.read_avro(path).to_arrow()
861
+ # Cast the result to match the original table's schema
862
+ # (the round-trip from arrow->polars->arrow casts string to large string)
863
+ result = result.cast(self.table.schema)
864
+ assert result.equals(self.table)
865
+
866
+
867
+ class TestPyArrowReaders(TestCase):
868
+ def setUp(self):
869
+ # Create test data files for reading
870
+ self.fs = fsspec.filesystem("file")
871
+ self.base_path = tempfile.mkdtemp()
872
+ self.fs.makedirs(self.base_path, exist_ok=True)
873
+
874
+ # Create test Table
875
+ self.table = pa.Table.from_pylist(
876
+ [
877
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
878
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
879
+ {"col1": "test", "col2": 3, "col3": 3.3},
880
+ ]
881
+ )
882
+
883
+ # Write test files in different formats
884
+ self._create_test_files()
885
+
886
+ def tearDown(self):
887
+ self.fs.rm(self.base_path, recursive=True)
888
+
889
+ def _create_test_files(self):
890
+ # Create CSV file (GZIP compressed)
891
+ csv_path = f"{self.base_path}/test.csv"
892
+ with self.fs.open(csv_path, "wb") as f:
893
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
894
+ content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
895
+ gz.write(content.encode("utf-8"))
896
+
897
+ # Create TSV file (GZIP compressed)
898
+ tsv_path = f"{self.base_path}/test.tsv"
899
+ with self.fs.open(tsv_path, "wb") as f:
900
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
901
+ content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
902
+ gz.write(content.encode("utf-8"))
903
+
904
+ # Create PSV file (GZIP compressed)
905
+ psv_path = f"{self.base_path}/test.psv"
906
+ with self.fs.open(psv_path, "wb") as f:
907
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
908
+ content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
909
+ gz.write(content.encode("utf-8"))
910
+
911
+ # Create unescaped TSV file (GZIP compressed)
912
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
913
+ pa.Table.from_pylist(
914
+ [
915
+ {"col1": "abc", "col2": 1, "col3": 1.1},
916
+ {"col1": "def", "col2": 2, "col3": 2.2},
917
+ {"col1": "ghi", "col2": 3, "col3": 3.3},
918
+ ]
919
+ )
920
+ with self.fs.open(unescaped_tsv_path, "wb") as f:
921
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
922
+ content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
923
+ gz.write(content.encode("utf-8"))
924
+
925
+ # Create Parquet file
926
+ parquet_path = f"{self.base_path}/test.parquet"
927
+ with self.fs.open(parquet_path, "wb") as f:
928
+ papq.write_table(self.table, f)
929
+
930
+ # Create Feather file
931
+ feather_path = f"{self.base_path}/test.feather"
932
+ with self.fs.open(feather_path, "wb") as f:
933
+ paf.write_feather(self.table, f)
934
+
935
+ # Create JSON file (GZIP compressed)
936
+ json_path = f"{self.base_path}/test.json"
937
+ with self.fs.open(json_path, "wb") as f:
938
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
939
+ # Create NDJSON format - one JSON object per line
940
+ lines = []
941
+ for row in self.table.to_pylist():
942
+ lines.append(json.dumps(row))
943
+ content = "\n".join(lines) + "\n"
944
+ gz.write(content.encode("utf-8"))
945
+
946
+ # Create Avro file using polars (since pyarrow delegates to polars for Avro)
947
+ avro_path = f"{self.base_path}/test.avro"
948
+ import polars as pl
949
+
950
+ pl_df = pl.from_arrow(self.table)
951
+ pl_df.write_avro(avro_path)
952
+
953
+ # Create ORC file
954
+ orc_path = f"{self.base_path}/test.orc"
955
+ with self.fs.open(orc_path, "wb") as f:
956
+ paorc.write_table(self.table, f)
957
+
958
+ def test_content_type_to_reader_kwargs(self):
959
+ # Test CSV kwargs
960
+ csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
961
+ expected_csv = {"parse_options": pacsv.ParseOptions(delimiter=",")}
962
+ assert (
963
+ csv_kwargs["parse_options"].delimiter
964
+ == expected_csv["parse_options"].delimiter
965
+ )
966
+
967
+ # Test TSV kwargs
968
+ tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
969
+ expected_tsv = {"parse_options": pacsv.ParseOptions(delimiter="\t")}
970
+ assert (
971
+ tsv_kwargs["parse_options"].delimiter
972
+ == expected_tsv["parse_options"].delimiter
973
+ )
974
+
975
+ # Test PSV kwargs
976
+ psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
977
+ expected_psv = {"parse_options": pacsv.ParseOptions(delimiter="|")}
978
+ assert (
979
+ psv_kwargs["parse_options"].delimiter
980
+ == expected_psv["parse_options"].delimiter
981
+ )
982
+
983
+ # Test unescaped TSV kwargs
984
+ unescaped_kwargs = content_type_to_reader_kwargs(
985
+ ContentType.UNESCAPED_TSV.value
986
+ )
987
+ assert unescaped_kwargs["parse_options"].delimiter == "\t"
988
+ assert unescaped_kwargs["parse_options"].quote_char is False
989
+ assert unescaped_kwargs["convert_options"].null_values == [""]
990
+
991
+ # Test Parquet kwargs (should be empty)
992
+ parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
993
+ assert parquet_kwargs == {}
994
+
995
+ # Test ORC kwargs (should be empty)
996
+ orc_kwargs = content_type_to_reader_kwargs(ContentType.ORC.value)
997
+ assert orc_kwargs == {}
998
+
999
+ # Test Avro kwargs (should be empty)
1000
+ avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
1001
+ assert avro_kwargs == {}
1002
+
1003
+ def test_add_column_kwargs(self):
1004
+ kwargs = {}
1005
+ column_names = ["col1", "col2", "col3"]
1006
+ include_columns = ["col1", "col2"]
1007
+
1008
+ # Test CSV column kwargs
1009
+ _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
1010
+ assert kwargs["read_options"].column_names == column_names
1011
+ assert kwargs["convert_options"].include_columns == include_columns
1012
+
1013
+ # Test Parquet column kwargs
1014
+ kwargs = {}
1015
+ _add_column_kwargs(
1016
+ ContentType.PARQUET.value, column_names, include_columns, kwargs
1017
+ )
1018
+ assert kwargs["columns"] == include_columns
1019
+
1020
+ def test_file_to_table_csv(self):
1021
+ # Test reading CSV with file_to_table
1022
+ csv_path = f"{self.base_path}/test.csv"
1023
+
1024
+ result = file_to_table(
1025
+ csv_path,
1026
+ ContentType.CSV.value,
1027
+ ContentEncoding.GZIP.value,
1028
+ filesystem=self.fs,
1029
+ column_names=["col1", "col2", "col3"],
1030
+ )
1031
+
1032
+ assert len(result) == 3
1033
+ assert result.column_names == ["col1", "col2", "col3"]
1034
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1035
+
1036
+ def test_file_to_table_tsv(self):
1037
+ # Test reading TSV with file_to_table
1038
+ tsv_path = f"{self.base_path}/test.tsv"
1039
+
1040
+ result = file_to_table(
1041
+ tsv_path,
1042
+ ContentType.TSV.value,
1043
+ ContentEncoding.GZIP.value,
1044
+ filesystem=self.fs,
1045
+ column_names=["col1", "col2", "col3"],
1046
+ )
1047
+
1048
+ assert len(result) == 3
1049
+ assert result.column_names == ["col1", "col2", "col3"]
1050
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1051
+
1052
+ def test_file_to_table_psv(self):
1053
+ # Test reading PSV with file_to_table
1054
+ psv_path = f"{self.base_path}/test.psv"
1055
+
1056
+ result = file_to_table(
1057
+ psv_path,
1058
+ ContentType.PSV.value,
1059
+ ContentEncoding.GZIP.value,
1060
+ filesystem=self.fs,
1061
+ column_names=["col1", "col2", "col3"],
466
1062
  )
467
1063
 
468
- result = s3_file_to_table(
469
- BZ2_COMPRESSED_FILE_UTSV_PATH,
1064
+ assert len(result) == 3
1065
+ assert result.column_names == ["col1", "col2", "col3"]
1066
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1067
+
1068
+ def test_file_to_table_unescaped_tsv(self):
1069
+ # Test reading unescaped TSV with file_to_table
1070
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
1071
+
1072
+ result = file_to_table(
1073
+ unescaped_tsv_path,
470
1074
  ContentType.UNESCAPED_TSV.value,
1075
+ ContentEncoding.GZIP.value,
1076
+ filesystem=self.fs,
1077
+ column_names=["col1", "col2", "col3"],
1078
+ )
1079
+
1080
+ assert len(result) == 3
1081
+ assert result.column_names == ["col1", "col2", "col3"]
1082
+ assert result.column("col1").to_pylist() == ["abc", "def", "ghi"]
1083
+
1084
+ def test_file_to_table_parquet(self):
1085
+ # Test reading Parquet with file_to_table
1086
+ parquet_path = f"{self.base_path}/test.parquet"
1087
+
1088
+ result = file_to_table(
1089
+ parquet_path, ContentType.PARQUET.value, filesystem=self.fs
1090
+ )
1091
+
1092
+ assert len(result) == 3
1093
+ assert result.column_names == ["col1", "col2", "col3"]
1094
+ assert result.equals(self.table)
1095
+
1096
+ def test_file_to_table_feather(self):
1097
+ # Test reading Feather with file_to_table
1098
+ feather_path = f"{self.base_path}/test.feather"
1099
+
1100
+ result = file_to_table(
1101
+ feather_path, ContentType.FEATHER.value, filesystem=self.fs
1102
+ )
1103
+
1104
+ assert len(result) == 3
1105
+ assert result.column_names == ["col1", "col2", "col3"]
1106
+ assert result.equals(self.table)
1107
+
1108
+ def test_file_to_table_json(self):
1109
+ # Test reading JSON with file_to_table
1110
+ json_path = f"{self.base_path}/test.json"
1111
+
1112
+ result = file_to_table(
1113
+ json_path,
1114
+ ContentType.JSON.value,
1115
+ ContentEncoding.GZIP.value,
1116
+ filesystem=self.fs,
1117
+ )
1118
+
1119
+ assert len(result) == 3
1120
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1121
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1122
+
1123
+ def test_file_to_table_avro(self):
1124
+ # Test reading Avro with file_to_table
1125
+ avro_path = f"{self.base_path}/test.avro"
1126
+
1127
+ result = file_to_table(avro_path, ContentType.AVRO.value, filesystem=self.fs)
1128
+
1129
+ assert len(result) == 3
1130
+ assert result.column_names == ["col1", "col2", "col3"]
1131
+ # Avro may have different dtypes, so compare values
1132
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1133
+
1134
+ def test_file_to_table_orc(self):
1135
+ # Test reading ORC with file_to_table
1136
+ orc_path = f"{self.base_path}/test.orc"
1137
+
1138
+ result = file_to_table(orc_path, ContentType.ORC.value, filesystem=self.fs)
1139
+
1140
+ assert len(result) == 3
1141
+ assert result.column_names == ["col1", "col2", "col3"]
1142
+ assert result.equals(self.table)
1143
+
1144
+ def test_file_to_table_with_column_selection(self):
1145
+ # Test reading with column selection
1146
+ csv_path = f"{self.base_path}/test.csv"
1147
+
1148
+ result = file_to_table(
1149
+ csv_path,
1150
+ ContentType.CSV.value,
1151
+ ContentEncoding.GZIP.value,
1152
+ filesystem=self.fs,
1153
+ column_names=["col1", "col2", "col3"],
1154
+ include_columns=["col1", "col2"],
1155
+ )
1156
+
1157
+ assert len(result) == 3
1158
+ assert len(result.column_names) == 2 # Should only have 2 columns
1159
+ assert result.column_names == ["col1", "col2"]
1160
+
1161
+ def test_file_to_table_with_kwargs_provider(self):
1162
+ # Test reading with kwargs provider
1163
+ csv_path = f"{self.base_path}/test.csv"
1164
+ provider = ReadKwargsProviderPyArrowCsvPureUtf8(
1165
+ include_columns=["col1", "col2", "col3"]
1166
+ )
1167
+
1168
+ result = file_to_table(
1169
+ csv_path,
1170
+ ContentType.CSV.value,
1171
+ ContentEncoding.GZIP.value,
1172
+ filesystem=self.fs,
1173
+ column_names=["col1", "col2", "col3"],
1174
+ pa_read_func_kwargs_provider=provider,
1175
+ )
1176
+
1177
+ assert len(result) == 3
1178
+ assert result.column_names == ["col1", "col2", "col3"]
1179
+ # With string types provider, all columns should be strings
1180
+ for col_name in result.column_names:
1181
+ assert result.schema.field(col_name).type == pa.string()
1182
+
1183
+ def test_file_to_table_filesystem_inference(self):
1184
+ # Test filesystem inference when no filesystem is provided
1185
+ # Use JSON file since it should work well with inference
1186
+ json_path = f"{self.base_path}/test.json"
1187
+
1188
+ result = file_to_table(
1189
+ json_path,
1190
+ ContentType.JSON.value,
1191
+ ContentEncoding.GZIP.value
1192
+ # No filesystem provided - should be inferred
1193
+ )
1194
+
1195
+ assert len(result) == 3
1196
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1197
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1198
+
1199
+ def test_file_to_table_unsupported_content_type(self):
1200
+ # Test error handling for unsupported content type
1201
+ parquet_path = f"{self.base_path}/test.parquet"
1202
+
1203
+ with self.assertRaises(NotImplementedError) as context:
1204
+ file_to_table(parquet_path, "unsupported/content-type", filesystem=self.fs)
1205
+
1206
+ assert "not implemented" in str(context.exception)
1207
+
1208
+ def test_file_to_table_bzip2_compression(self):
1209
+ # Test BZIP2 compression handling
1210
+ import bz2
1211
+
1212
+ # Create a BZIP2 compressed CSV file
1213
+ csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
1214
+ compressed_content = bz2.compress(csv_content.encode("utf-8"))
1215
+
1216
+ bz2_path = f"{self.base_path}/test.csv.bz2"
1217
+ with self.fs.open(bz2_path, "wb") as f:
1218
+ f.write(compressed_content)
1219
+
1220
+ result = file_to_table(
1221
+ bz2_path,
1222
+ ContentType.CSV.value,
471
1223
  ContentEncoding.BZIP2.value,
472
- ["is_active", "ship_datetime_utc"],
473
- None,
474
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
475
- schema=schema
476
- ),
1224
+ filesystem=self.fs,
1225
+ column_names=["col1", "col2", "col3"],
477
1226
  )
478
1227
 
479
- self.assertEqual(len(result), 3)
480
- self.assertEqual(len(result.column_names), 2)
481
- result_schema = result.schema
482
- for index, field in enumerate(result_schema):
483
- self.assertEqual(field.name, schema.field(index).name)
1228
+ assert len(result) == 3
1229
+ assert result.column_names == ["col1", "col2", "col3"]
1230
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
484
1231
 
485
- self.assertEqual(result.schema.field(0).type, "string")
486
1232
 
487
- def test_s3_file_to_table_when_parquet_sanity(self):
1233
+ class TestFileToParquet(TestCase):
1234
+ def setUp(self):
1235
+ # Create test data files for reading
1236
+ self.fs = fsspec.filesystem("file")
1237
+ self.base_path = tempfile.mkdtemp()
1238
+ self.fs.makedirs(self.base_path, exist_ok=True)
1239
+
1240
+ # Create test Table
1241
+ self.table = pa.Table.from_pylist(
1242
+ [
1243
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
1244
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
1245
+ {"col1": "test", "col2": 3, "col3": 3.3},
1246
+ ]
1247
+ )
1248
+
1249
+ # Write test parquet files
1250
+ self._create_test_files()
488
1251
 
489
- pa_kwargs_provider = lambda content_type, kwargs: {
490
- "reader_type": "pyarrow",
491
- **kwargs,
492
- }
1252
+ def tearDown(self):
1253
+ self.fs.rm(self.base_path, recursive=True)
493
1254
 
494
- result = s3_file_to_table(
495
- PARQUET_FILE_PATH,
496
- ContentType.PARQUET.value,
497
- ContentEncoding.IDENTITY.value,
498
- ["n_legs", "animal"],
499
- ["n_legs"],
500
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1255
+ def _create_test_files(self):
1256
+ # Create basic Parquet file
1257
+ parquet_path = f"{self.base_path}/test.parquet"
1258
+ with self.fs.open(parquet_path, "wb") as f:
1259
+ papq.write_table(self.table, f)
1260
+
1261
+ # Create larger Parquet file with multiple row groups
1262
+ large_table = pa.Table.from_pylist(
1263
+ [{"col1": f"row_{i}", "col2": i, "col3": float(i)} for i in range(1000)]
501
1264
  )
1265
+ large_parquet_path = f"{self.base_path}/test_large.parquet"
1266
+ with self.fs.open(large_parquet_path, "wb") as f:
1267
+ papq.write_table(
1268
+ large_table, f, row_group_size=100
1269
+ ) # Create multiple row groups
502
1270
 
503
- self.assertEqual(len(result), 6)
504
- self.assertEqual(len(result.column_names), 1)
505
- schema = result.schema
506
- schema_index = schema.get_field_index("n_legs")
507
- self.assertEqual(schema.field(schema_index).type, "int64")
1271
+ def test_file_to_parquet_basic(self):
1272
+ # Test basic parquet file reading
1273
+ parquet_path = f"{self.base_path}/test.parquet"
508
1274
 
509
- def test_s3_file_to_table_when_parquet_schema_overridden(self):
1275
+ result = file_to_parquet(parquet_path, filesystem=self.fs)
1276
+
1277
+ assert isinstance(result, papq.ParquetFile)
1278
+ assert result.num_row_groups > 0
1279
+ assert result.metadata.num_rows == 3
1280
+ assert result.metadata.num_columns == 3
1281
+
1282
+ # Verify we can read the data
1283
+ table = result.read()
1284
+ assert len(table) == 3
1285
+ assert table.column_names == ["col1", "col2", "col3"]
1286
+
1287
+ def test_file_to_parquet_with_schema_provider(self):
1288
+ # Test with schema override provider
1289
+ parquet_path = f"{self.base_path}/test.parquet"
510
1290
 
511
1291
  schema = pa.schema(
512
- [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
1292
+ [
1293
+ pa.field("col1", pa.string()),
1294
+ pa.field("col2", pa.string()), # Override to string
1295
+ pa.field("col3", pa.string()), # Override to string
1296
+ ]
513
1297
  )
514
1298
 
515
- pa_kwargs_provider = lambda content_type, kwargs: {
516
- "schema": schema,
517
- "reader_type": "pyarrow",
518
- **kwargs,
519
- }
1299
+ provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
520
1300
 
521
- result = s3_file_to_table(
522
- PARQUET_FILE_PATH,
1301
+ result = file_to_parquet(
1302
+ parquet_path, filesystem=self.fs, pa_read_func_kwargs_provider=provider
1303
+ )
1304
+
1305
+ assert isinstance(result, papq.ParquetFile)
1306
+ # Note: schema override might not affect ParquetFile metadata,
1307
+ # but should work when reading the table
1308
+ table = result.read()
1309
+ assert len(table) == 3
1310
+
1311
+ def test_file_to_parquet_with_custom_kwargs(self):
1312
+ # Test with custom ParquetFile kwargs
1313
+ parquet_path = f"{self.base_path}/test.parquet"
1314
+
1315
+ result = file_to_parquet(
1316
+ parquet_path,
1317
+ filesystem=self.fs,
1318
+ validate_schema=True, # Custom kwarg for ParquetFile
1319
+ memory_map=True, # Another custom kwarg
1320
+ )
1321
+
1322
+ assert isinstance(result, papq.ParquetFile)
1323
+ assert result.metadata.num_rows == 3
1324
+
1325
+ def test_file_to_parquet_filesystem_inference(self):
1326
+ # Test filesystem inference when no filesystem is provided
1327
+ parquet_path = f"{self.base_path}/test.parquet"
1328
+
1329
+ result = file_to_parquet(
1330
+ parquet_path
1331
+ # No filesystem provided - should be inferred
1332
+ )
1333
+
1334
+ assert isinstance(result, papq.ParquetFile)
1335
+        assert result.metadata.num_rows == 3
+        assert result.metadata.num_columns == 3
+
+    def test_file_to_parquet_large_file(self):
+        # Test with larger parquet file (multiple row groups)
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+        assert result.num_row_groups > 1  # Should have multiple row groups
+
+        # Test reading specific row groups
+        first_row_group = result.read_row_group(0)
+        assert len(first_row_group) <= 100  # Based on row_group_size=100
+
+    def test_file_to_parquet_metadata_access(self):
+        # Test accessing various metadata properties
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Test metadata access
+        metadata = result.metadata
+        assert metadata.num_rows == 3
+        assert metadata.num_columns == 3
+        assert metadata.num_row_groups >= 1
+
+        # Test schema access
+        schema = result.schema
+        assert len(schema) == 3
+        assert "col1" in schema.names
+        assert "col2" in schema.names
+        assert "col3" in schema.names
+
+        # Test schema_arrow property
+        schema_arrow = result.schema_arrow
+        assert isinstance(schema_arrow, pa.Schema)
+        assert len(schema_arrow) == 3
+
+    def test_file_to_parquet_column_selection(self):
+        # Test reading specific columns
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Read only specific columns
+        table = result.read(columns=["col1", "col2"])
+        assert len(table.column_names) == 2
+        assert table.column_names == ["col1", "col2"]
+        assert len(table) == 3
+
+    def test_file_to_parquet_invalid_content_type(self):
+        # Test error handling for invalid content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_type=ContentType.CSV.value,  # Invalid content type
+                filesystem=self.fs,
+            )
+
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_invalid_content_encoding(self):
+        # Test error handling for invalid content encoding
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_encoding=ContentEncoding.GZIP.value,  # Invalid encoding
+                filesystem=self.fs,
+            )
+
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_different_filesystems(self):
+        # Test with different filesystem implementations
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # Test with fsspec filesystem
+        result_fsspec = file_to_parquet(parquet_path, filesystem=self.fs)
+        assert isinstance(result_fsspec, papq.ParquetFile)
+        assert result_fsspec.metadata.num_rows == 3
+
+        # Test with None filesystem (inferred)
+        result_inferred = file_to_parquet(parquet_path, filesystem=None)
+        assert isinstance(result_inferred, papq.ParquetFile)
+        assert result_inferred.metadata.num_rows == 3
+
+    def test_file_to_parquet_lazy_loading(self):
+        # Test that ParquetFile provides lazy loading capabilities
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        # ParquetFile should be created without loading all data
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+
+        # Test reading only specific columns (lazy loading)
+        partial_table = result.read(columns=["col1", "col2"])
+        assert len(partial_table) == 1000  # All rows but only 2 columns
+        assert partial_table.column_names == ["col1", "col2"]
+
+        # Test reading specific row group (lazy loading)
+        row_group_table = result.read_row_group(0)
+        assert len(row_group_table) <= 100  # Based on row_group_size
+
+    def test_file_to_parquet_performance_timing(self):
+        # Test that performance timing is logged (basic functionality test)
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # This should complete without error and log timing
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+
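For readers skimming the diff, the lazy-read pattern these tests exercise is standard pyarrow.parquet.ParquetFile behavior. A minimal sketch, not part of the package change, using plain pyarrow; the file path and row_group_size are illustrative assumptions:

# Illustrative sketch (not part of the diff): the lazy ParquetFile read pattern
# exercised by the tests above. Path and row_group_size are assumptions.
import pyarrow as pa
import pyarrow.parquet as papq

table = pa.table(
    {"col1": list(range(1000)), "col2": ["x"] * 1000, "col3": [1.0] * 1000}
)
papq.write_table(table, "/tmp/example_large.parquet", row_group_size=100)

pf = papq.ParquetFile("/tmp/example_large.parquet")  # opens footer metadata only
print(pf.metadata.num_rows, pf.num_row_groups)       # 1000 rows, 10 row groups
subset = pf.read(columns=["col1", "col2"])           # column projection
first_group = pf.read_row_group(0)                   # at most 100 rows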
+class TestFileToTableFilesystems(TestCase):
+    """Test file_to_table with different filesystem implementations across all content types."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.tmpdir)
+
+    def _create_test_files(self):
+        """Create test files for all supported content types."""
+        # Test data
+        test_data = pa.table(
+            {
+                "id": [1, 2, 3, 4, 5],
+                "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
+                "age": [25, 30, 35, 28, 32],
+                "score": [85.5, 92.0, 78.5, 88.0, 95.5],
+            }
+        )
+
+        # File paths
+        self.csv_file = f"{self.tmpdir}/test.csv"
+        self.tsv_file = f"{self.tmpdir}/test.tsv"
+        self.psv_file = f"{self.tmpdir}/test.psv"
+        self.unescaped_tsv_file = f"{self.tmpdir}/test_unescaped.tsv"
+        self.parquet_file = f"{self.tmpdir}/test.parquet"
+        self.feather_file = f"{self.tmpdir}/test.feather"
+        self.json_file = f"{self.tmpdir}/test.json"
+        self.orc_file = f"{self.tmpdir}/test.orc"
+        self.avro_file = f"{self.tmpdir}/test.avro"
+
+        # Create CSV file
+        pacsv.write_csv(
+            test_data,
+            self.csv_file,
+            write_options=pacsv.WriteOptions(delimiter=",", include_header=False),
+        )
+
+        # Create TSV file
+        pacsv.write_csv(
+            test_data,
+            self.tsv_file,
+            write_options=pacsv.WriteOptions(delimiter="\t", include_header=False),
+        )
+
+        # Create PSV file
+        pacsv.write_csv(
+            test_data,
+            self.psv_file,
+            write_options=pacsv.WriteOptions(delimiter="|", include_header=False),
+        )
+
+        # Create unescaped TSV file
+        pacsv.write_csv(
+            test_data,
+            self.unescaped_tsv_file,
+            write_options=pacsv.WriteOptions(
+                delimiter="\t", include_header=False, quoting_style="none"
+            ),
+        )
+
+        # Create Parquet file
+        papq.write_table(test_data, self.parquet_file)
+
+        # Create Feather file
+        paf.write_feather(test_data, self.feather_file)
+
+        # Create JSON file (write as JSONL format)
+        df = test_data.to_pandas()
+        with open(self.json_file, "w") as f:
+            for _, row in df.iterrows():
+                json.dump(row.to_dict(), f)
+                f.write("\n")
+
+        # Create ORC file
+        paorc.write_table(test_data, self.orc_file)
+
+        # Create Avro file
+        try:
+            import polars as pl
+
+            pl_df = pl.from_arrow(test_data)
+            pl_df.write_avro(self.avro_file)
+        except ImportError:
+            # Skip Avro file creation if polars is not available
+            self.avro_file = None
+
+    def _get_filesystems(self, file_path):
+        """Get different filesystem implementations for testing."""
+        # fsspec AbstractFileSystem
+        fsspec_fs = fsspec.filesystem("file")
+
+        # PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+
+        # None for automatic inference
+        auto_infer_fs = None
+
+        return [
+            ("fsspec", fsspec_fs),
+            ("pyarrow", pyarrow_fs),
+            ("auto_infer", auto_infer_fs),
+        ]
+
+    def _assert_table_content(self, table, content_type):
+        """Assert that the loaded table has expected content."""
+        self.assertEqual(len(table), 5, f"Expected 5 rows for {content_type}")
+        self.assertEqual(
+            len(table.columns), 4, f"Expected 4 columns for {content_type}"
+        )
+
+        # Check column names exist (order might vary for some formats)
+        column_names = set(table.column_names)
+        expected_columns = {"id", "name", "age", "score"}
+        self.assertEqual(
+            column_names, expected_columns, f"Column names mismatch for {content_type}"
+        )
+
+    def test_csv_all_filesystems(self):
+        """Test CSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"CSV with {fs_name}")
+
+    def test_tsv_all_filesystems(self):
+        """Test TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.tsv_file,
+                    ContentType.TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"TSV with {fs_name}")
+
+    def test_psv_all_filesystems(self):
+        """Test PSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.psv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.psv_file,
+                    ContentType.PSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"PSV with {fs_name}")
+
+    def test_unescaped_tsv_all_filesystems(self):
+        """Test unescaped TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.unescaped_tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.unescaped_tsv_file,
+                    ContentType.UNESCAPED_TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"UNESCAPED_TSV with {fs_name}")
+
+    def test_parquet_all_filesystems(self):
+        """Test Parquet reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"PARQUET with {fs_name}")
+
+    def test_feather_all_filesystems(self):
+        """Test Feather reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.feather_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.feather_file,
+                    ContentType.FEATHER.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"FEATHER with {fs_name}")
+
+    def test_json_all_filesystems(self):
+        """Test JSON reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.json_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.json_file,
+                    ContentType.JSON.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"JSON with {fs_name}")
+
+    def test_orc_all_filesystems(self):
+        """Test ORC reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.orc_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.orc_file,
+                    ContentType.ORC.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"ORC with {fs_name}")
+
+    def test_avro_all_filesystems(self):
+        """Test Avro reading with all filesystem types."""
+        if self.avro_file is None:
+            self.skipTest("Avro file creation skipped (polars not available)")
+
+        for fs_name, filesystem in self._get_filesystems(self.avro_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.avro_file,
+                    ContentType.AVRO.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"AVRO with {fs_name}")
+
+    def test_column_selection_all_filesystems(self):
+        """Test column selection works with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    include_columns=["name", "age"],
+                )
+                self.assertEqual(
+                    len(table.columns), 2, f"Expected 2 columns with {fs_name}"
+                )
+                self.assertEqual(
+                    set(table.column_names),
+                    {"name", "age"},
+                    f"Column selection failed with {fs_name}",
+                )
+
+    def test_kwargs_provider_all_filesystems(self):
+        """Test that kwargs providers work with all filesystem types."""
+
+        def schema_provider(content_type, kwargs):
+            if content_type == ContentType.CSV.value:
+                # Force all columns to be strings
+                kwargs["convert_options"] = pacsv.ConvertOptions(
+                    column_types={
+                        "id": pa.string(),
+                        "name": pa.string(),
+                        "age": pa.string(),
+                        "score": pa.string(),
+                    }
+                )
+            return kwargs
+
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                    pa_read_func_kwargs_provider=schema_provider,
+                )
+                # Check that all columns are strings
+                for field in table.schema:
+                    self.assertEqual(
+                        field.type,
+                        pa.string(),
+                        f"Column {field.name} should be string with {fs_name}",
+                    )
+
+    def test_filesystem_auto_inference_consistency(self):
+        """Test that auto-inferred filesystem produces same results as explicit filesystems."""
+        # Use Parquet as it's most reliable across filesystem types
+
+        # Read with auto-inference
+        auto_table = file_to_table(
+            self.parquet_file,
             ContentType.PARQUET.value,
             ContentEncoding.IDENTITY.value,
-            ["n_legs", "animal"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+            filesystem=None,  # Auto-infer
         )
 
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 2)
+        # Read with explicit fsspec filesystem
+        fsspec_fs = fsspec.filesystem("file")
+        fsspec_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=fsspec_fs,
+        )
 
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
+        # Read with explicit PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+        pyarrow_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=pyarrow_fs,
+        )
+
+        # All should produce equivalent results
+        self.assertTrue(
+            auto_table.equals(fsspec_table),
+            "Auto-inferred result should match fsspec result",
+        )
+        self.assertTrue(
+            auto_table.equals(pyarrow_table),
+            "Auto-inferred result should match PyArrow result",
+        )
 
-        self.assertEqual(result.schema.field(1).type, "string")
+    def test_error_handling_all_filesystems(self):
+        """Test error handling works consistently across filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                # Test unsupported content type
+                with self.assertRaises(NotImplementedError):
+                    file_to_table(
+                        self.parquet_file,
+                        "UNSUPPORTED_TYPE",
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
+
+                # Test non-existent file
+                with self.assertRaises((FileNotFoundError, OSError)):
+                    file_to_table(
+                        f"{self.tmpdir}/non_existent.parquet",
+                        ContentType.PARQUET.value,
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
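Taken together, the new tests pin down one calling convention: file_to_table takes a path, a ContentType value, and a ContentEncoding value positionally, and accepts a filesystem keyword that may be an fsspec filesystem, a pyarrow.fs filesystem, or None for automatic inference. A minimal usage sketch under those assumptions, not part of the diff; the local path is hypothetical and the imports of file_to_table, ContentType, and ContentEncoding are assumed to match the test module above:

# Illustrative sketch (not part of the diff). Assumes file_to_table, ContentType,
# and ContentEncoding are imported as in the test module above, and that
# /tmp/example.parquet is an existing local parquet file (hypothetical path).
import fsspec
import pyarrow.fs as pafs

path = "/tmp/example.parquet"

auto_table = file_to_table(
    path, ContentType.PARQUET.value, ContentEncoding.IDENTITY.value, filesystem=None
)
fsspec_table = file_to_table(
    path,
    ContentType.PARQUET.value,
    ContentEncoding.IDENTITY.value,
    filesystem=fsspec.filesystem("file"),
)
pyarrow_table = file_to_table(
    path,
    ContentType.PARQUET.value,
    ContentEncoding.IDENTITY.value,
    filesystem=pafs.LocalFileSystem(),
)

# Per test_filesystem_auto_inference_consistency, all three reads should agree.
assert auto_table.equals(fsspec_table) and auto_table.equals(pyarrow_table)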