deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
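The largest test change in this release is to deltacat/tests/utils/test_pyarrow.py, whose expanded diff follows. It renames the S3-specific partial-parquet helpers to filesystem-agnostic ones and adds coverage for the new file_to_table, file_to_parquet, and table_to_file utilities. A minimal round-trip sketch, assembled only from the calls visible in the test diff below (the path and table contents are illustrative, the argument order and the lambda block-path provider mirror those tests, and this is not a complete API reference):

    import fsspec
    import pyarrow as pa

    from deltacat.types.media import ContentEncoding, ContentType
    from deltacat.utils.pyarrow import file_to_table, table_to_file

    fs = fsspec.filesystem("file")
    table = pa.table({"col1": ["a", "b"], "col2": [1, 2]})
    path = "/tmp/example.csv.gz"  # hypothetical output path

    # Write a pyarrow Table to a single file; per the tests, delimited text and
    # JSON outputs are GZIP-compressed on write.
    table_to_file(table, path, fs, lambda x: path, content_type=ContentType.CSV.value)

    # Read the file back into a pyarrow Table, declaring content type and encoding.
    result = file_to_table(
        path,
        ContentType.CSV.value,
        ContentEncoding.GZIP.value,
        filesystem=fs,
        column_names=["col1", "col2"],
    )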
@@ -1,27 +1,50 @@
|
|
1
1
|
from unittest import TestCase
|
2
2
|
from deltacat.utils.pyarrow import (
|
3
|
-
|
3
|
+
partial_parquet_file_to_table,
|
4
4
|
pyarrow_read_csv,
|
5
|
+
ContentTypeValidationError,
|
5
6
|
content_type_to_reader_kwargs,
|
6
7
|
_add_column_kwargs,
|
7
|
-
|
8
|
+
file_to_table,
|
9
|
+
file_to_parquet,
|
10
|
+
table_to_file,
|
8
11
|
ReadKwargsProviderPyArrowSchemaOverride,
|
12
|
+
ReadKwargsProviderPyArrowCsvPureUtf8,
|
13
|
+
RAISE_ON_DECIMAL_OVERFLOW,
|
9
14
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
15
|
)
|
16
|
+
import decimal
|
11
17
|
from deltacat.types.media import ContentEncoding, ContentType
|
12
18
|
from deltacat.types.partial_download import PartialParquetParameters
|
13
19
|
from pyarrow.parquet import ParquetFile
|
20
|
+
import tempfile
|
14
21
|
import pyarrow as pa
|
22
|
+
from pyarrow import csv as pacsv
|
23
|
+
import fsspec
|
24
|
+
import gzip
|
25
|
+
import json
|
26
|
+
from pyarrow import (
|
27
|
+
feather as paf,
|
28
|
+
parquet as papq,
|
29
|
+
orc as paorc,
|
30
|
+
)
|
15
31
|
|
16
32
|
PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
|
33
|
+
PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
|
17
34
|
EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
|
18
35
|
NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
|
36
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
|
37
|
+
"deltacat/tests/utils/data/overflowing_decimal_precision.csv"
|
38
|
+
)
|
39
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
|
40
|
+
"deltacat/tests/utils/data/overflowing_decimal_scale.csv"
|
41
|
+
)
|
19
42
|
GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
|
20
43
|
BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
|
21
44
|
|
22
45
|
|
23
|
-
class
|
24
|
-
def
|
46
|
+
class TestPartialParquetFileToTable(TestCase):
|
47
|
+
def test_partial_parquet_file_to_table_sanity(self):
|
25
48
|
|
26
49
|
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
27
50
|
partial_parquet_params = PartialParquetParameters.of(
|
@@ -35,7 +58,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
35
58
|
# only first row group to be downloaded
|
36
59
|
partial_parquet_params.row_groups_to_download.pop()
|
37
60
|
|
38
|
-
result =
|
61
|
+
result = partial_parquet_file_to_table(
|
39
62
|
PARQUET_FILE_PATH,
|
40
63
|
include_columns=["n_legs"],
|
41
64
|
content_encoding=ContentEncoding.IDENTITY.value,
|
@@ -46,7 +69,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
46
69
|
self.assertEqual(len(result), 3)
|
47
70
|
self.assertEqual(len(result.columns), 1)
|
48
71
|
|
49
|
-
def
|
72
|
+
def test_partial_parquet_file_to_table_when_schema_passed(self):
|
50
73
|
|
51
74
|
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
52
75
|
partial_parquet_params = PartialParquetParameters.of(
|
@@ -66,7 +89,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
66
89
|
|
67
90
|
pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
|
68
91
|
|
69
|
-
result =
|
92
|
+
result = partial_parquet_file_to_table(
|
70
93
|
PARQUET_FILE_PATH,
|
71
94
|
ContentType.PARQUET.value,
|
72
95
|
ContentEncoding.IDENTITY.value,
|
@@ -85,7 +108,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
85
108
|
self.assertEqual(result_schema.field(2).type, "int64")
|
86
109
|
self.assertEqual(result_schema.field(2).name, "MISSING")
|
87
110
|
|
88
|
-
def
|
111
|
+
def test_partial_parquet_file_to_table_when_schema_missing_columns(self):
|
89
112
|
|
90
113
|
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
91
114
|
partial_parquet_params = PartialParquetParameters.of(
|
@@ -105,7 +128,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
105
128
|
|
106
129
|
pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
|
107
130
|
|
108
|
-
result =
|
131
|
+
result = partial_parquet_file_to_table(
|
109
132
|
PARQUET_FILE_PATH,
|
110
133
|
ContentType.PARQUET.value,
|
111
134
|
ContentEncoding.IDENTITY.value,
|
@@ -122,7 +145,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
122
145
|
self.assertEqual(result_schema.field(0).type, "int64")
|
123
146
|
self.assertEqual(result_schema.field(0).name, "MISSING")
|
124
147
|
|
125
|
-
def
|
148
|
+
def test_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
|
126
149
|
self,
|
127
150
|
):
|
128
151
|
|
@@ -139,11 +162,11 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
139
162
|
|
140
163
|
pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
|
141
164
|
|
142
|
-
result =
|
165
|
+
result = partial_parquet_file_to_table(
|
143
166
|
PARQUET_FILE_PATH,
|
144
167
|
ContentType.PARQUET.value,
|
145
168
|
ContentEncoding.IDENTITY.value,
|
146
|
-
["n_legs", "animal"],
|
169
|
+
column_names=["n_legs", "animal"],
|
147
170
|
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
148
171
|
partial_file_download_params=partial_parquet_params,
|
149
172
|
)
|
@@ -155,7 +178,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
155
178
|
self.assertEqual(result_schema.field(0).type, "string")
|
156
179
|
self.assertEqual(result_schema.field(0).name, "n_legs") # order doesn't change
|
157
180
|
|
158
|
-
def
|
181
|
+
def test_partial_parquet_file_to_table_when_multiple_row_groups(self):
|
159
182
|
|
160
183
|
pq_file = ParquetFile(PARQUET_FILE_PATH)
|
161
184
|
partial_parquet_params = PartialParquetParameters.of(
|
@@ -166,7 +189,7 @@ class TestS3PartialParquetFileToTable(TestCase):
|
|
166
189
|
partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
|
167
190
|
)
|
168
191
|
|
169
|
-
result =
|
192
|
+
result = partial_parquet_file_to_table(
|
170
193
|
PARQUET_FILE_PATH,
|
171
194
|
content_encoding=ContentEncoding.IDENTITY.value,
|
172
195
|
content_type=ContentType.PARQUET.value,
|
@@ -407,130 +430,1388 @@ class TestReadCSV(TestCase):
|
|
407
430
|
),
|
408
431
|
)
|
409
432
|
|
433
|
+
def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
|
434
|
+
schema = pa.schema(
|
435
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
436
|
+
)
|
437
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
438
|
+
_add_column_kwargs(
|
439
|
+
ContentType.UNESCAPED_TSV.value,
|
440
|
+
["is_active", "decimal_value"],
|
441
|
+
["is_active", "decimal_value"],
|
442
|
+
kwargs,
|
443
|
+
)
|
444
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
410
445
|
|
411
|
-
|
412
|
-
|
446
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
447
|
+
self.assertRaises(
|
448
|
+
pa.lib.ArrowInvalid,
|
449
|
+
lambda: pyarrow_read_csv(
|
450
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
|
451
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
|
452
|
+
),
|
453
|
+
)
|
413
454
|
|
455
|
+
def test_read_csv_when_decimal_precision_overflows_sanity(self):
|
414
456
|
schema = pa.schema(
|
415
|
-
[("is_active", pa.string()), ("
|
457
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
458
|
+
)
|
459
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
460
|
+
_add_column_kwargs(
|
461
|
+
ContentType.UNESCAPED_TSV.value,
|
462
|
+
["is_active", "decimal_value"],
|
463
|
+
["is_active", "decimal_value"],
|
464
|
+
kwargs,
|
416
465
|
)
|
417
466
|
|
418
|
-
|
419
|
-
|
467
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
468
|
+
|
469
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
470
|
+
|
471
|
+
self.assertRaises(
|
472
|
+
pa.lib.ArrowInvalid,
|
473
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
|
474
|
+
)
|
475
|
+
|
476
|
+
def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
|
477
|
+
schema = pa.schema(
|
478
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
479
|
+
)
|
480
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
481
|
+
_add_column_kwargs(
|
420
482
|
ContentType.UNESCAPED_TSV.value,
|
421
|
-
|
422
|
-
["is_active", "
|
423
|
-
|
424
|
-
|
425
|
-
|
483
|
+
["is_active", "decimal_value"],
|
484
|
+
["is_active", "decimal_value"],
|
485
|
+
kwargs,
|
486
|
+
)
|
487
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
488
|
+
|
489
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
490
|
+
|
491
|
+
self.assertRaises(
|
492
|
+
pa.lib.ArrowInvalid,
|
493
|
+
lambda: pyarrow_read_csv(
|
494
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
495
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
|
426
496
|
),
|
427
497
|
)
|
428
498
|
|
499
|
+
def test_read_csv_when_decimal_scale_overflows_sanity(self):
|
500
|
+
schema = pa.schema(
|
501
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
502
|
+
)
|
503
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
504
|
+
_add_column_kwargs(
|
505
|
+
ContentType.UNESCAPED_TSV.value,
|
506
|
+
["is_active", "decimal_value"],
|
507
|
+
["is_active", "decimal_value"],
|
508
|
+
kwargs,
|
509
|
+
)
|
510
|
+
|
511
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
512
|
+
|
513
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
514
|
+
|
515
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
516
|
+
|
429
517
|
self.assertEqual(len(result), 3)
|
518
|
+
self.assertEqual(
|
519
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
520
|
+
) # rounding decimal
|
521
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
430
522
|
self.assertEqual(len(result.column_names), 2)
|
431
523
|
result_schema = result.schema
|
432
|
-
|
433
|
-
|
524
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
525
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
|
434
526
|
|
435
|
-
|
527
|
+
def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
|
528
|
+
schema = pa.schema(
|
529
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
|
530
|
+
)
|
531
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
532
|
+
_add_column_kwargs(
|
533
|
+
ContentType.UNESCAPED_TSV.value,
|
534
|
+
["is_active", "decimal_value"],
|
535
|
+
["is_active", "decimal_value"],
|
536
|
+
kwargs,
|
537
|
+
)
|
436
538
|
|
437
|
-
|
539
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
540
|
+
|
541
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
542
|
+
|
543
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
544
|
+
|
545
|
+
self.assertEqual(len(result), 3)
|
546
|
+
self.assertEqual(
|
547
|
+
result[1][0].as_py(),
|
548
|
+
decimal.Decimal("322200"), # consequence of negative scale
|
549
|
+
) # rounding decimal
|
550
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
|
551
|
+
self.assertEqual(len(result.column_names), 2)
|
552
|
+
result_schema = result.schema
|
553
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
554
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
|
438
555
|
|
556
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
|
439
557
|
schema = pa.schema(
|
440
|
-
[("is_active", pa.string()), ("
|
558
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
559
|
+
)
|
560
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
561
|
+
_add_column_kwargs(
|
562
|
+
ContentType.UNESCAPED_TSV.value,
|
563
|
+
["is_active", "decimal_value"],
|
564
|
+
["is_active", "decimal_value"],
|
565
|
+
kwargs,
|
441
566
|
)
|
442
567
|
|
443
|
-
|
444
|
-
|
568
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
569
|
+
|
570
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
571
|
+
|
572
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
573
|
+
|
574
|
+
self.assertEqual(len(result), 3)
|
575
|
+
self.assertEqual(
|
576
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
577
|
+
) # rounding decimal
|
578
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
579
|
+
self.assertEqual(len(result.column_names), 2)
|
580
|
+
result_schema = result.schema
|
581
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
582
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
|
583
|
+
|
584
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
|
585
|
+
self,
|
586
|
+
):
|
587
|
+
schema = pa.schema(
|
588
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
589
|
+
)
|
590
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
591
|
+
_add_column_kwargs(
|
445
592
|
ContentType.UNESCAPED_TSV.value,
|
446
|
-
|
447
|
-
["is_active", "
|
448
|
-
|
449
|
-
|
450
|
-
|
593
|
+
["is_active", "decimal_value"],
|
594
|
+
["is_active", "decimal_value"],
|
595
|
+
kwargs,
|
596
|
+
)
|
597
|
+
|
598
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
599
|
+
|
600
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
601
|
+
|
602
|
+
self.assertRaises(
|
603
|
+
pa.lib.ArrowNotImplementedError,
|
604
|
+
lambda: pyarrow_read_csv(
|
605
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
606
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
|
451
607
|
),
|
452
608
|
)
|
453
609
|
|
454
|
-
|
610
|
+
def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
|
611
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
612
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
|
613
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
614
|
+
|
615
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
616
|
+
|
617
|
+
# The default behavior of pyarrow is to invalid skip rows
|
618
|
+
self.assertEqual(len(result), 2)
|
619
|
+
self.assertEqual(result[1][0].as_py(), 32.33) # rounding decimal
|
620
|
+
self.assertEqual(result[1][1].as_py(), 0.4) # not rounded
|
455
621
|
self.assertEqual(len(result.column_names), 2)
|
456
622
|
result_schema = result.schema
|
457
|
-
|
458
|
-
|
623
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
624
|
+
self.assertEqual(result_schema.field(1).type, pa.float64())
|
459
625
|
|
460
|
-
|
626
|
+
def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
|
627
|
+
self,
|
628
|
+
):
|
629
|
+
schema = pa.schema(
|
630
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
|
631
|
+
)
|
632
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
633
|
+
_add_column_kwargs(
|
634
|
+
ContentType.UNESCAPED_TSV.value,
|
635
|
+
["is_active", "decimal_value"],
|
636
|
+
["is_active", "decimal_value"],
|
637
|
+
kwargs,
|
638
|
+
)
|
639
|
+
|
640
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
641
|
+
|
642
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
461
643
|
|
462
|
-
|
644
|
+
self.assertRaises(
|
645
|
+
pa.lib.ArrowInvalid,
|
646
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
|
647
|
+
)
|
463
648
|
|
649
|
+
def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
|
464
650
|
schema = pa.schema(
|
465
|
-
[("is_active", pa.string()), ("
|
651
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
|
652
|
+
)
|
653
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
654
|
+
_add_column_kwargs(
|
655
|
+
ContentType.UNESCAPED_TSV.value,
|
656
|
+
["is_active", "decimal_value"],
|
657
|
+
["is_active", "decimal_value"],
|
658
|
+
kwargs,
|
659
|
+
)
|
660
|
+
|
661
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
662
|
+
|
663
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
664
|
+
|
665
|
+
with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
|
666
|
+
result = pyarrow_read_csv(file, **kwargs)
|
667
|
+
|
668
|
+
self.assertEqual(len(result), 3)
|
669
|
+
self.assertEqual(
|
670
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
671
|
+
) # rounding decimal
|
672
|
+
self.assertEqual(
|
673
|
+
result[1][1].as_py(), decimal.Decimal("32.33")
|
674
|
+
) # not rounded
|
675
|
+
self.assertEqual(len(result.column_names), 2)
|
676
|
+
result_schema = result.schema
|
677
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
678
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
|
679
|
+
|
680
|
+
|
681
|
+
class TestWriters(TestCase):
|
682
|
+
def setUp(self):
|
683
|
+
self.table = pa.table({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
|
684
|
+
self.fs = fsspec.filesystem("file")
|
685
|
+
self.base_path = tempfile.mkdtemp()
|
686
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
687
|
+
|
688
|
+
def tearDown(self):
|
689
|
+
self.fs.rm(self.base_path, recursive=True)
|
690
|
+
|
691
|
+
def test_write_feather(self):
|
692
|
+
path = f"{self.base_path}/test.feather"
|
693
|
+
|
694
|
+
table_to_file(
|
695
|
+
self.table,
|
696
|
+
path,
|
697
|
+
self.fs,
|
698
|
+
lambda x: path,
|
699
|
+
content_type=ContentType.FEATHER.value,
|
700
|
+
)
|
701
|
+
assert self.fs.exists(path), "file was not written"
|
702
|
+
|
703
|
+
# Verify content
|
704
|
+
result = paf.read_table(path)
|
705
|
+
assert result.equals(self.table)
|
706
|
+
|
707
|
+
def test_write_csv(self):
|
708
|
+
path = f"{self.base_path}/test.csv.gz"
|
709
|
+
|
710
|
+
table_to_file(
|
711
|
+
self.table,
|
712
|
+
path,
|
713
|
+
self.fs,
|
714
|
+
lambda x: path,
|
715
|
+
content_type=ContentType.CSV.value,
|
716
|
+
)
|
717
|
+
assert self.fs.exists(path), "file was not written"
|
718
|
+
|
719
|
+
# Verify content (should be GZIP compressed)
|
720
|
+
with self.fs.open(path, "rb") as f:
|
721
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
722
|
+
content = gz.read().decode("utf-8")
|
723
|
+
# Should be quoted due to commas in data
|
724
|
+
assert '"a,b\tc|d",1' in content
|
725
|
+
assert '"e,f\tg|h",2' in content
|
726
|
+
|
727
|
+
def test_write_tsv(self):
|
728
|
+
path = f"{self.base_path}/test.tsv.gz"
|
729
|
+
|
730
|
+
table_to_file(
|
731
|
+
self.table,
|
732
|
+
path,
|
733
|
+
self.fs,
|
734
|
+
lambda x: path,
|
735
|
+
content_type=ContentType.TSV.value,
|
736
|
+
)
|
737
|
+
assert self.fs.exists(path), "file was not written"
|
738
|
+
|
739
|
+
# Verify content (should be GZIP compressed)
|
740
|
+
with self.fs.open(path, "rb") as f:
|
741
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
742
|
+
content = gz.read().decode("utf-8")
|
743
|
+
# Should be quoted due to tabs in data
|
744
|
+
assert '"a,b\tc|d"\t1' in content
|
745
|
+
assert '"e,f\tg|h"\t2' in content
|
746
|
+
|
747
|
+
def test_write_psv(self):
|
748
|
+
path = f"{self.base_path}/test.psv.gz"
|
749
|
+
|
750
|
+
table_to_file(
|
751
|
+
self.table,
|
752
|
+
path,
|
753
|
+
self.fs,
|
754
|
+
lambda x: path,
|
755
|
+
content_type=ContentType.PSV.value,
|
756
|
+
)
|
757
|
+
assert self.fs.exists(path), "file was not written"
|
758
|
+
|
759
|
+
# Verify content (should be GZIP compressed)
|
760
|
+
with self.fs.open(path, "rb") as f:
|
761
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
762
|
+
content = gz.read().decode("utf-8")
|
763
|
+
# Should be quoted due to pipes in data
|
764
|
+
assert '"a,b\tc|d"|1' in content
|
765
|
+
assert '"e,f\tg|h"|2' in content
|
766
|
+
|
767
|
+
def test_write_unescaped_tsv(self):
|
768
|
+
# Create table without delimiters for unescaped TSV
|
769
|
+
table = pa.table({"col1": ["abc", "def"], "col2": [1, 2]})
|
770
|
+
path = f"{self.base_path}/test.tsv.gz"
|
771
|
+
|
772
|
+
table_to_file(
|
773
|
+
table,
|
774
|
+
path,
|
775
|
+
self.fs,
|
776
|
+
lambda x: path,
|
777
|
+
content_type=ContentType.UNESCAPED_TSV.value,
|
778
|
+
)
|
779
|
+
assert self.fs.exists(path), "file was not written"
|
780
|
+
|
781
|
+
# Verify content (should be GZIP compressed)
|
782
|
+
with self.fs.open(path, "rb") as f:
|
783
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
784
|
+
content = gz.read().decode("utf-8")
|
785
|
+
# With quoting_style="none", strings should not be quoted
|
786
|
+
assert "abc\t1" in content
|
787
|
+
assert "def\t2" in content
|
788
|
+
|
789
|
+
def test_write_orc(self):
|
790
|
+
path = f"{self.base_path}/test.orc"
|
791
|
+
|
792
|
+
table_to_file(
|
793
|
+
self.table,
|
794
|
+
path,
|
795
|
+
self.fs,
|
796
|
+
lambda x: path,
|
797
|
+
content_type=ContentType.ORC.value,
|
798
|
+
)
|
799
|
+
assert self.fs.exists(path), "file was not written"
|
800
|
+
|
801
|
+
# Verify content
|
802
|
+
result = paorc.read_table(path)
|
803
|
+
assert result.equals(self.table)
|
804
|
+
|
805
|
+
def test_write_parquet(self):
|
806
|
+
path = f"{self.base_path}/test.parquet"
|
807
|
+
|
808
|
+
table_to_file(
|
809
|
+
self.table,
|
810
|
+
path,
|
811
|
+
self.fs,
|
812
|
+
lambda x: path,
|
813
|
+
content_type=ContentType.PARQUET.value,
|
814
|
+
)
|
815
|
+
assert self.fs.exists(path), "file was not written"
|
816
|
+
|
817
|
+
# Verify content
|
818
|
+
result = papq.read_table(path)
|
819
|
+
assert result.equals(self.table)
|
820
|
+
|
821
|
+
def test_write_json(self):
|
822
|
+
path = f"{self.base_path}/test.json.gz"
|
823
|
+
|
824
|
+
table_to_file(
|
825
|
+
self.table,
|
826
|
+
path,
|
827
|
+
self.fs,
|
828
|
+
lambda x: path,
|
829
|
+
content_type=ContentType.JSON.value,
|
830
|
+
)
|
831
|
+
assert self.fs.exists(path), "file was not written"
|
832
|
+
|
833
|
+
# Verify content (should be GZIP compressed)
|
834
|
+
with self.fs.open(path, "rb") as f:
|
835
|
+
with gzip.GzipFile(fileobj=f) as gz:
|
836
|
+
content = gz.read().decode("utf-8")
|
837
|
+
# Each line should be a valid JSON object
|
838
|
+
lines = [
|
839
|
+
line for line in content.split("\n") if line
|
840
|
+
] # Skip empty lines
|
841
|
+
assert len(lines) == 2 # 2 records
|
842
|
+
assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
|
843
|
+
assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}
|
844
|
+
|
845
|
+
def test_write_avro(self):
|
846
|
+
import polars as pl
|
847
|
+
|
848
|
+
path = f"{self.base_path}/test.avro"
|
849
|
+
|
850
|
+
table_to_file(
|
851
|
+
self.table,
|
852
|
+
path,
|
853
|
+
self.fs,
|
854
|
+
lambda x: path,
|
855
|
+
content_type=ContentType.AVRO.value,
|
856
|
+
)
|
857
|
+
assert self.fs.exists(path), "file was not written"
|
858
|
+
|
859
|
+
# Verify content by reading with polars
|
860
|
+
result = pl.read_avro(path).to_arrow()
|
861
|
+
# Cast the result to match the original table's schema
|
862
|
+
# (the round-trip from arrow->polars->arrow casts string to large string)
|
863
|
+
result = result.cast(self.table.schema)
|
864
|
+
assert result.equals(self.table)
|
865
|
+
|
866
|
+
|
867
|
+
class TestPyArrowReaders(TestCase):
|
868
|
+
def setUp(self):
|
869
|
+
# Create test data files for reading
|
870
|
+
self.fs = fsspec.filesystem("file")
|
871
|
+
self.base_path = tempfile.mkdtemp()
|
872
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
873
|
+
|
874
|
+
# Create test Table
|
875
|
+
self.table = pa.Table.from_pylist(
|
876
|
+
[
|
877
|
+
{"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
|
878
|
+
{"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
|
879
|
+
{"col1": "test", "col2": 3, "col3": 3.3},
|
880
|
+
]
|
881
|
+
)
|
882
|
+
|
883
|
+
# Write test files in different formats
|
884
|
+
self._create_test_files()
|
885
|
+
|
886
|
+
def tearDown(self):
|
887
|
+
self.fs.rm(self.base_path, recursive=True)
|
888
|
+
|
889
|
+
def _create_test_files(self):
|
890
|
+
# Create CSV file (GZIP compressed)
|
891
|
+
csv_path = f"{self.base_path}/test.csv"
|
892
|
+
with self.fs.open(csv_path, "wb") as f:
|
893
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
894
|
+
content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
|
895
|
+
gz.write(content.encode("utf-8"))
|
896
|
+
|
897
|
+
# Create TSV file (GZIP compressed)
|
898
|
+
tsv_path = f"{self.base_path}/test.tsv"
|
899
|
+
with self.fs.open(tsv_path, "wb") as f:
|
900
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
901
|
+
content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
|
902
|
+
gz.write(content.encode("utf-8"))
|
903
|
+
|
904
|
+
# Create PSV file (GZIP compressed)
|
905
|
+
psv_path = f"{self.base_path}/test.psv"
|
906
|
+
with self.fs.open(psv_path, "wb") as f:
|
907
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
908
|
+
content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
|
909
|
+
gz.write(content.encode("utf-8"))
|
910
|
+
|
911
|
+
# Create unescaped TSV file (GZIP compressed)
|
912
|
+
unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
|
913
|
+
pa.Table.from_pylist(
|
914
|
+
[
|
915
|
+
{"col1": "abc", "col2": 1, "col3": 1.1},
|
916
|
+
{"col1": "def", "col2": 2, "col3": 2.2},
|
917
|
+
{"col1": "ghi", "col2": 3, "col3": 3.3},
|
918
|
+
]
|
919
|
+
)
|
920
|
+
with self.fs.open(unescaped_tsv_path, "wb") as f:
|
921
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
922
|
+
content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
|
923
|
+
gz.write(content.encode("utf-8"))
|
924
|
+
|
925
|
+
# Create Parquet file
|
926
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
927
|
+
with self.fs.open(parquet_path, "wb") as f:
|
928
|
+
papq.write_table(self.table, f)
|
929
|
+
|
930
|
+
# Create Feather file
|
931
|
+
feather_path = f"{self.base_path}/test.feather"
|
932
|
+
with self.fs.open(feather_path, "wb") as f:
|
933
|
+
paf.write_feather(self.table, f)
|
934
|
+
|
935
|
+
# Create JSON file (GZIP compressed)
|
936
|
+
json_path = f"{self.base_path}/test.json"
|
937
|
+
with self.fs.open(json_path, "wb") as f:
|
938
|
+
with gzip.GzipFile(fileobj=f, mode="wb") as gz:
|
939
|
+
# Create NDJSON format - one JSON object per line
|
940
|
+
lines = []
|
941
|
+
for row in self.table.to_pylist():
|
942
|
+
lines.append(json.dumps(row))
|
943
|
+
content = "\n".join(lines) + "\n"
|
944
|
+
gz.write(content.encode("utf-8"))
|
945
|
+
|
946
|
+
# Create Avro file using polars (since pyarrow delegates to polars for Avro)
|
947
|
+
avro_path = f"{self.base_path}/test.avro"
|
948
|
+
import polars as pl
|
949
|
+
|
950
|
+
pl_df = pl.from_arrow(self.table)
|
951
|
+
pl_df.write_avro(avro_path)
|
952
|
+
|
953
|
+
# Create ORC file
|
954
|
+
orc_path = f"{self.base_path}/test.orc"
|
955
|
+
with self.fs.open(orc_path, "wb") as f:
|
956
|
+
paorc.write_table(self.table, f)
|
957
|
+
|
958
|
+
def test_content_type_to_reader_kwargs(self):
|
959
|
+
# Test CSV kwargs
|
960
|
+
csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
|
961
|
+
expected_csv = {"parse_options": pacsv.ParseOptions(delimiter=",")}
|
962
|
+
assert (
|
963
|
+
csv_kwargs["parse_options"].delimiter
|
964
|
+
== expected_csv["parse_options"].delimiter
|
965
|
+
)
|
966
|
+
|
967
|
+
# Test TSV kwargs
|
968
|
+
tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
|
969
|
+
expected_tsv = {"parse_options": pacsv.ParseOptions(delimiter="\t")}
|
970
|
+
assert (
|
971
|
+
tsv_kwargs["parse_options"].delimiter
|
972
|
+
== expected_tsv["parse_options"].delimiter
|
973
|
+
)
|
974
|
+
|
975
|
+
# Test PSV kwargs
|
976
|
+
psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
|
977
|
+
expected_psv = {"parse_options": pacsv.ParseOptions(delimiter="|")}
|
978
|
+
assert (
|
979
|
+
psv_kwargs["parse_options"].delimiter
|
980
|
+
== expected_psv["parse_options"].delimiter
|
981
|
+
)
|
982
|
+
|
983
|
+
# Test unescaped TSV kwargs
|
984
|
+
unescaped_kwargs = content_type_to_reader_kwargs(
|
985
|
+
ContentType.UNESCAPED_TSV.value
|
986
|
+
)
|
987
|
+
assert unescaped_kwargs["parse_options"].delimiter == "\t"
|
988
|
+
assert unescaped_kwargs["parse_options"].quote_char is False
|
989
|
+
assert unescaped_kwargs["convert_options"].null_values == [""]
|
990
|
+
|
991
|
+
# Test Parquet kwargs (should be empty)
|
992
|
+
parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
|
993
|
+
assert parquet_kwargs == {}
|
994
|
+
|
995
|
+
# Test ORC kwargs (should be empty)
|
996
|
+
orc_kwargs = content_type_to_reader_kwargs(ContentType.ORC.value)
|
997
|
+
assert orc_kwargs == {}
|
998
|
+
|
999
|
+
# Test Avro kwargs (should be empty)
|
1000
|
+
avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
|
1001
|
+
assert avro_kwargs == {}
|
1002
|
+
|
1003
|
+
def test_add_column_kwargs(self):
|
1004
|
+
kwargs = {}
|
1005
|
+
column_names = ["col1", "col2", "col3"]
|
1006
|
+
include_columns = ["col1", "col2"]
|
1007
|
+
|
1008
|
+
# Test CSV column kwargs
|
1009
|
+
_add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
|
1010
|
+
assert kwargs["read_options"].column_names == column_names
|
1011
|
+
assert kwargs["convert_options"].include_columns == include_columns
|
1012
|
+
|
1013
|
+
# Test Parquet column kwargs
|
1014
|
+
kwargs = {}
|
1015
|
+
_add_column_kwargs(
|
1016
|
+
ContentType.PARQUET.value, column_names, include_columns, kwargs
|
1017
|
+
)
|
1018
|
+
assert kwargs["columns"] == include_columns
|
1019
|
+
|
1020
|
+
def test_file_to_table_csv(self):
|
1021
|
+
# Test reading CSV with file_to_table
|
1022
|
+
csv_path = f"{self.base_path}/test.csv"
|
1023
|
+
|
1024
|
+
result = file_to_table(
|
1025
|
+
csv_path,
|
1026
|
+
ContentType.CSV.value,
|
1027
|
+
ContentEncoding.GZIP.value,
|
1028
|
+
filesystem=self.fs,
|
1029
|
+
column_names=["col1", "col2", "col3"],
|
1030
|
+
)
|
1031
|
+
|
1032
|
+
assert len(result) == 3
|
1033
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1034
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1035
|
+
|
1036
|
+
def test_file_to_table_tsv(self):
|
1037
|
+
# Test reading TSV with file_to_table
|
1038
|
+
tsv_path = f"{self.base_path}/test.tsv"
|
1039
|
+
|
1040
|
+
result = file_to_table(
|
1041
|
+
tsv_path,
|
1042
|
+
ContentType.TSV.value,
|
1043
|
+
ContentEncoding.GZIP.value,
|
1044
|
+
filesystem=self.fs,
|
1045
|
+
column_names=["col1", "col2", "col3"],
|
1046
|
+
)
|
1047
|
+
|
1048
|
+
assert len(result) == 3
|
1049
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1050
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1051
|
+
|
1052
|
+
def test_file_to_table_psv(self):
|
1053
|
+
# Test reading PSV with file_to_table
|
1054
|
+
psv_path = f"{self.base_path}/test.psv"
|
1055
|
+
|
1056
|
+
result = file_to_table(
|
1057
|
+
psv_path,
|
1058
|
+
ContentType.PSV.value,
|
1059
|
+
ContentEncoding.GZIP.value,
|
1060
|
+
filesystem=self.fs,
|
1061
|
+
column_names=["col1", "col2", "col3"],
|
466
1062
|
)
|
467
1063
|
|
468
|
-
result
|
469
|
-
|
1064
|
+
assert len(result) == 3
|
1065
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1066
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1067
|
+
|
1068
|
+
def test_file_to_table_unescaped_tsv(self):
|
1069
|
+
# Test reading unescaped TSV with file_to_table
|
1070
|
+
unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
|
1071
|
+
|
1072
|
+
result = file_to_table(
|
1073
|
+
unescaped_tsv_path,
|
470
1074
|
ContentType.UNESCAPED_TSV.value,
|
1075
|
+
ContentEncoding.GZIP.value,
|
1076
|
+
filesystem=self.fs,
|
1077
|
+
column_names=["col1", "col2", "col3"],
|
1078
|
+
)
|
1079
|
+
|
1080
|
+
assert len(result) == 3
|
1081
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1082
|
+
assert result.column("col1").to_pylist() == ["abc", "def", "ghi"]
|
1083
|
+
|
1084
|
+
def test_file_to_table_parquet(self):
|
1085
|
+
# Test reading Parquet with file_to_table
|
1086
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1087
|
+
|
1088
|
+
result = file_to_table(
|
1089
|
+
parquet_path, ContentType.PARQUET.value, filesystem=self.fs
|
1090
|
+
)
|
1091
|
+
|
1092
|
+
assert len(result) == 3
|
1093
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1094
|
+
assert result.equals(self.table)
|
1095
|
+
|
1096
|
+
def test_file_to_table_feather(self):
|
1097
|
+
# Test reading Feather with file_to_table
|
1098
|
+
feather_path = f"{self.base_path}/test.feather"
|
1099
|
+
|
1100
|
+
result = file_to_table(
|
1101
|
+
feather_path, ContentType.FEATHER.value, filesystem=self.fs
|
1102
|
+
)
|
1103
|
+
|
1104
|
+
assert len(result) == 3
|
1105
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1106
|
+
assert result.equals(self.table)
|
1107
|
+
|
1108
|
+
def test_file_to_table_json(self):
|
1109
|
+
# Test reading JSON with file_to_table
|
1110
|
+
json_path = f"{self.base_path}/test.json"
|
1111
|
+
|
1112
|
+
result = file_to_table(
|
1113
|
+
json_path,
|
1114
|
+
ContentType.JSON.value,
|
1115
|
+
ContentEncoding.GZIP.value,
|
1116
|
+
filesystem=self.fs,
|
1117
|
+
)
|
1118
|
+
|
1119
|
+
assert len(result) == 3
|
1120
|
+
assert set(result.column_names) == {"col1", "col2", "col3"}
|
1121
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1122
|
+
|
1123
|
+
def test_file_to_table_avro(self):
|
1124
|
+
# Test reading Avro with file_to_table
|
1125
|
+
avro_path = f"{self.base_path}/test.avro"
|
1126
|
+
|
1127
|
+
result = file_to_table(avro_path, ContentType.AVRO.value, filesystem=self.fs)
|
1128
|
+
|
1129
|
+
assert len(result) == 3
|
1130
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1131
|
+
# Avro may have different dtypes, so compare values
|
1132
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1133
|
+
|
1134
|
+
def test_file_to_table_orc(self):
|
1135
|
+
# Test reading ORC with file_to_table
|
1136
|
+
orc_path = f"{self.base_path}/test.orc"
|
1137
|
+
|
1138
|
+
result = file_to_table(orc_path, ContentType.ORC.value, filesystem=self.fs)
|
1139
|
+
|
1140
|
+
assert len(result) == 3
|
1141
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1142
|
+
assert result.equals(self.table)
|
1143
|
+
|
1144
|
+
def test_file_to_table_with_column_selection(self):
|
1145
|
+
# Test reading with column selection
|
1146
|
+
csv_path = f"{self.base_path}/test.csv"
|
1147
|
+
|
1148
|
+
result = file_to_table(
|
1149
|
+
csv_path,
|
1150
|
+
ContentType.CSV.value,
|
1151
|
+
ContentEncoding.GZIP.value,
|
1152
|
+
filesystem=self.fs,
|
1153
|
+
column_names=["col1", "col2", "col3"],
|
1154
|
+
include_columns=["col1", "col2"],
|
1155
|
+
)
|
1156
|
+
|
1157
|
+
assert len(result) == 3
|
1158
|
+
assert len(result.column_names) == 2 # Should only have 2 columns
|
1159
|
+
assert result.column_names == ["col1", "col2"]
|
1160
|
+
|
1161
|
+
def test_file_to_table_with_kwargs_provider(self):
|
1162
|
+
# Test reading with kwargs provider
|
1163
|
+
csv_path = f"{self.base_path}/test.csv"
|
1164
|
+
provider = ReadKwargsProviderPyArrowCsvPureUtf8(
|
1165
|
+
include_columns=["col1", "col2", "col3"]
|
1166
|
+
)
|
1167
|
+
|
1168
|
+
result = file_to_table(
|
1169
|
+
csv_path,
|
1170
|
+
ContentType.CSV.value,
|
1171
|
+
ContentEncoding.GZIP.value,
|
1172
|
+
filesystem=self.fs,
|
1173
|
+
column_names=["col1", "col2", "col3"],
|
1174
|
+
pa_read_func_kwargs_provider=provider,
|
1175
|
+
)
|
1176
|
+
|
1177
|
+
assert len(result) == 3
|
1178
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1179
|
+
# With string types provider, all columns should be strings
|
1180
|
+
for col_name in result.column_names:
|
1181
|
+
assert result.schema.field(col_name).type == pa.string()
|
1182
|
+
|
1183
|
+
def test_file_to_table_filesystem_inference(self):
|
1184
|
+
# Test filesystem inference when no filesystem is provided
|
1185
|
+
# Use JSON file since it should work well with inference
|
1186
|
+
json_path = f"{self.base_path}/test.json"
|
1187
|
+
|
1188
|
+
result = file_to_table(
|
1189
|
+
json_path,
|
1190
|
+
ContentType.JSON.value,
|
1191
|
+
ContentEncoding.GZIP.value
|
1192
|
+
# No filesystem provided - should be inferred
|
1193
|
+
)
|
1194
|
+
|
1195
|
+
assert len(result) == 3
|
1196
|
+
assert set(result.column_names) == {"col1", "col2", "col3"}
|
1197
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
1198
|
+
|
1199
|
+
def test_file_to_table_unsupported_content_type(self):
|
1200
|
+
# Test error handling for unsupported content type
|
1201
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1202
|
+
|
1203
|
+
with self.assertRaises(NotImplementedError) as context:
|
1204
|
+
file_to_table(parquet_path, "unsupported/content-type", filesystem=self.fs)
|
1205
|
+
|
1206
|
+
assert "not implemented" in str(context.exception)
|
1207
|
+
|
1208
|
+
def test_file_to_table_bzip2_compression(self):
|
1209
|
+
# Test BZIP2 compression handling
|
1210
|
+
import bz2
|
1211
|
+
|
1212
|
+
# Create a BZIP2 compressed CSV file
|
1213
|
+
csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
|
1214
|
+
compressed_content = bz2.compress(csv_content.encode("utf-8"))
|
1215
|
+
|
1216
|
+
bz2_path = f"{self.base_path}/test.csv.bz2"
|
1217
|
+
with self.fs.open(bz2_path, "wb") as f:
|
1218
|
+
f.write(compressed_content)
|
1219
|
+
|
1220
|
+
result = file_to_table(
|
1221
|
+
bz2_path,
|
1222
|
+
ContentType.CSV.value,
|
471
1223
|
ContentEncoding.BZIP2.value,
|
472
|
-
|
473
|
-
|
474
|
-
pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
|
475
|
-
schema=schema
|
476
|
-
),
|
1224
|
+
filesystem=self.fs,
|
1225
|
+
column_names=["col1", "col2", "col3"],
|
477
1226
|
)
|
478
1227
|
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
for index, field in enumerate(result_schema):
|
483
|
-
self.assertEqual(field.name, schema.field(index).name)
|
1228
|
+
assert len(result) == 3
|
1229
|
+
assert result.column_names == ["col1", "col2", "col3"]
|
1230
|
+
assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
|
484
1231
|
|
485
|
-
self.assertEqual(result.schema.field(0).type, "string")
|
486
1232
|
|
487
|
-
|
1233
|
+
class TestFileToParquet(TestCase):
|
1234
|
+
def setUp(self):
|
1235
|
+
# Create test data files for reading
|
1236
|
+
self.fs = fsspec.filesystem("file")
|
1237
|
+
self.base_path = tempfile.mkdtemp()
|
1238
|
+
self.fs.makedirs(self.base_path, exist_ok=True)
|
1239
|
+
|
1240
|
+
# Create test Table
|
1241
|
+
self.table = pa.Table.from_pylist(
|
1242
|
+
[
|
1243
|
+
{"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
|
1244
|
+
{"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
|
1245
|
+
{"col1": "test", "col2": 3, "col3": 3.3},
|
1246
|
+
]
|
1247
|
+
)
|
1248
|
+
|
1249
|
+
# Write test parquet files
|
1250
|
+
self._create_test_files()
|
488
1251
|
|
489
|
-
|
490
|
-
|
491
|
-
**kwargs,
|
492
|
-
}
|
1252
|
+
def tearDown(self):
|
1253
|
+
self.fs.rm(self.base_path, recursive=True)
|
493
1254
|
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
1255
|
+
def _create_test_files(self):
|
1256
|
+
# Create basic Parquet file
|
1257
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1258
|
+
with self.fs.open(parquet_path, "wb") as f:
|
1259
|
+
papq.write_table(self.table, f)
|
1260
|
+
|
1261
|
+
# Create larger Parquet file with multiple row groups
|
1262
|
+
large_table = pa.Table.from_pylist(
|
1263
|
+
[{"col1": f"row_{i}", "col2": i, "col3": float(i)} for i in range(1000)]
|
501
1264
|
)
|
1265
|
+
large_parquet_path = f"{self.base_path}/test_large.parquet"
|
1266
|
+
with self.fs.open(large_parquet_path, "wb") as f:
|
1267
|
+
papq.write_table(
|
1268
|
+
large_table, f, row_group_size=100
|
1269
|
+
) # Create multiple row groups
|
502
1270
|
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
schema_index = schema.get_field_index("n_legs")
|
507
|
-
self.assertEqual(schema.field(schema_index).type, "int64")
|
1271
|
+
def test_file_to_parquet_basic(self):
|
1272
|
+
# Test basic parquet file reading
|
1273
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
508
1274
|
|
509
|
-
|
1275
|
+
result = file_to_parquet(parquet_path, filesystem=self.fs)
|
1276
|
+
|
1277
|
+
assert isinstance(result, papq.ParquetFile)
|
1278
|
+
assert result.num_row_groups > 0
|
1279
|
+
assert result.metadata.num_rows == 3
|
1280
|
+
assert result.metadata.num_columns == 3
|
1281
|
+
|
1282
|
+
# Verify we can read the data
|
1283
|
+
table = result.read()
|
1284
|
+
assert len(table) == 3
|
1285
|
+
assert table.column_names == ["col1", "col2", "col3"]
|
1286
|
+
|
1287
|
+
def test_file_to_parquet_with_schema_provider(self):
|
1288
|
+
# Test with schema override provider
|
1289
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
510
1290
|
|
511
1291
|
schema = pa.schema(
|
512
|
-
[
|
1292
|
+
[
|
1293
|
+
pa.field("col1", pa.string()),
|
1294
|
+
pa.field("col2", pa.string()), # Override to string
|
1295
|
+
pa.field("col3", pa.string()), # Override to string
|
1296
|
+
]
|
513
1297
|
)
|
514
1298
|
|
515
|
-
|
516
|
-
"schema": schema,
|
517
|
-
"reader_type": "pyarrow",
|
518
|
-
**kwargs,
|
519
|
-
}
|
1299
|
+
provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
520
1300
|
|
521
|
-
result =
|
522
|
-
|
1301
|
+
result = file_to_parquet(
|
1302
|
+
parquet_path, filesystem=self.fs, pa_read_func_kwargs_provider=provider
|
1303
|
+
)
|
1304
|
+
|
1305
|
+
assert isinstance(result, papq.ParquetFile)
|
1306
|
+
# Note: schema override might not affect ParquetFile metadata,
|
1307
|
+
# but should work when reading the table
|
1308
|
+
table = result.read()
|
1309
|
+
assert len(table) == 3
|
1310
|
+
|
1311
|
+
def test_file_to_parquet_with_custom_kwargs(self):
|
1312
|
+
# Test with custom ParquetFile kwargs
|
1313
|
+
parquet_path = f"{self.base_path}/test.parquet"
|
1314
|
+
|
1315
|
+
+        result = file_to_parquet(
+            parquet_path,
+            filesystem=self.fs,
+            validate_schema=True,  # Custom kwarg for ParquetFile
+            memory_map=True,  # Another custom kwarg
+        )
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+    def test_file_to_parquet_filesystem_inference(self):
+        # Test filesystem inference when no filesystem is provided
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(
+            parquet_path
+            # No filesystem provided - should be inferred
+        )
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+        assert result.metadata.num_columns == 3
+
+    def test_file_to_parquet_large_file(self):
+        # Test with larger parquet file (multiple row groups)
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+        assert result.num_row_groups > 1  # Should have multiple row groups
+
+        # Test reading specific row groups
+        first_row_group = result.read_row_group(0)
+        assert len(first_row_group) <= 100  # Based on row_group_size=100
+
+    def test_file_to_parquet_metadata_access(self):
+        # Test accessing various metadata properties
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Test metadata access
+        metadata = result.metadata
+        assert metadata.num_rows == 3
+        assert metadata.num_columns == 3
+        assert metadata.num_row_groups >= 1
+
+        # Test schema access
+        schema = result.schema
+        assert len(schema) == 3
+        assert "col1" in schema.names
+        assert "col2" in schema.names
+        assert "col3" in schema.names
+
+        # Test schema_arrow property
+        schema_arrow = result.schema_arrow
+        assert isinstance(schema_arrow, pa.Schema)
+        assert len(schema_arrow) == 3
+
+    def test_file_to_parquet_column_selection(self):
+        # Test reading specific columns
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Read only specific columns
+        table = result.read(columns=["col1", "col2"])
+        assert len(table.column_names) == 2
+        assert table.column_names == ["col1", "col2"]
+        assert len(table) == 3
+
+    def test_file_to_parquet_invalid_content_type(self):
+        # Test error handling for invalid content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_type=ContentType.CSV.value,  # Invalid content type
+                filesystem=self.fs,
+            )
+
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_invalid_content_encoding(self):
+        # Test error handling for invalid content encoding
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_encoding=ContentEncoding.GZIP.value,  # Invalid encoding
+                filesystem=self.fs,
+            )
+
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_different_filesystems(self):
+        # Test with different filesystem implementations
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # Test with fsspec filesystem
+        result_fsspec = file_to_parquet(parquet_path, filesystem=self.fs)
+        assert isinstance(result_fsspec, papq.ParquetFile)
+        assert result_fsspec.metadata.num_rows == 3
+
+        # Test with None filesystem (inferred)
+        result_inferred = file_to_parquet(parquet_path, filesystem=None)
+        assert isinstance(result_inferred, papq.ParquetFile)
+        assert result_inferred.metadata.num_rows == 3
+
+    def test_file_to_parquet_lazy_loading(self):
+        # Test that ParquetFile provides lazy loading capabilities
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        # ParquetFile should be created without loading all data
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+
+        # Test reading only specific columns (lazy loading)
+        partial_table = result.read(columns=["col1", "col2"])
+        assert len(partial_table) == 1000  # All rows but only 2 columns
+        assert partial_table.column_names == ["col1", "col2"]
+
+        # Test reading specific row group (lazy loading)
+        row_group_table = result.read_row_group(0)
+        assert len(row_group_table) <= 100  # Based on row_group_size
+
+    def test_file_to_parquet_performance_timing(self):
+        # Test that performance timing is logged (basic functionality test)
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # This should complete without error and log timing
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+
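The assertions above all treat the return value of `file_to_parquet` as a `pyarrow.parquet.ParquetFile`, i.e. a lazy handle whose footer metadata, row groups, and column subsets can be read without materializing the whole file. The sketch below illustrates that access pattern using plain `pyarrow` only; it is not the deltacat implementation, and the expectation that `file_to_parquet` returns an equivalent handle comes solely from the assertions in this diff.

```python
# Minimal sketch (plain pyarrow only) of the lazy ParquetFile access
# pattern the tests above assert on. No deltacat APIs are used here.
import tempfile

import pyarrow as pa
import pyarrow.parquet as papq

with tempfile.TemporaryDirectory() as tmpdir:
    path = f"{tmpdir}/example.parquet"
    papq.write_table(
        pa.table({"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [1.0, 2.0, 3.0]}),
        path,
        row_group_size=2,
    )

    pf = papq.ParquetFile(path)  # lazy handle; nothing is fully read yet
    assert pf.metadata.num_rows == 3  # footer metadata only
    assert pf.metadata.num_columns == 3
    subset = pf.read(columns=["col1", "col2"])  # column projection
    first_group = pf.read_row_group(0)  # single row group
    assert subset.column_names == ["col1", "col2"]
    assert len(first_group) <= 2
```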
+class TestFileToTableFilesystems(TestCase):
+    """Test file_to_table with different filesystem implementations across all content types."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.tmpdir)
+
+    def _create_test_files(self):
+        """Create test files for all supported content types."""
+        # Test data
+        test_data = pa.table(
+            {
+                "id": [1, 2, 3, 4, 5],
+                "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
+                "age": [25, 30, 35, 28, 32],
+                "score": [85.5, 92.0, 78.5, 88.0, 95.5],
+            }
+        )
+
+        # File paths
+        self.csv_file = f"{self.tmpdir}/test.csv"
+        self.tsv_file = f"{self.tmpdir}/test.tsv"
+        self.psv_file = f"{self.tmpdir}/test.psv"
+        self.unescaped_tsv_file = f"{self.tmpdir}/test_unescaped.tsv"
+        self.parquet_file = f"{self.tmpdir}/test.parquet"
+        self.feather_file = f"{self.tmpdir}/test.feather"
+        self.json_file = f"{self.tmpdir}/test.json"
+        self.orc_file = f"{self.tmpdir}/test.orc"
+        self.avro_file = f"{self.tmpdir}/test.avro"
+
+        # Create CSV file
+        pacsv.write_csv(
+            test_data,
+            self.csv_file,
+            write_options=pacsv.WriteOptions(delimiter=",", include_header=False),
+        )
+
+        # Create TSV file
+        pacsv.write_csv(
+            test_data,
+            self.tsv_file,
+            write_options=pacsv.WriteOptions(delimiter="\t", include_header=False),
+        )
+
+        # Create PSV file
+        pacsv.write_csv(
+            test_data,
+            self.psv_file,
+            write_options=pacsv.WriteOptions(delimiter="|", include_header=False),
+        )
+
+        # Create unescaped TSV file
+        pacsv.write_csv(
+            test_data,
+            self.unescaped_tsv_file,
+            write_options=pacsv.WriteOptions(
+                delimiter="\t", include_header=False, quoting_style="none"
+            ),
+        )
+
+        # Create Parquet file
+        papq.write_table(test_data, self.parquet_file)
+
+        # Create Feather file
+        paf.write_feather(test_data, self.feather_file)
+
+        # Create JSON file (write as JSONL format)
+        df = test_data.to_pandas()
+        with open(self.json_file, "w") as f:
+            for _, row in df.iterrows():
+                json.dump(row.to_dict(), f)
+                f.write("\n")
+
+        # Create ORC file
+        paorc.write_table(test_data, self.orc_file)
+
+        # Create Avro file
+        try:
+            import polars as pl
+
+            pl_df = pl.from_arrow(test_data)
+            pl_df.write_avro(self.avro_file)
+        except ImportError:
+            # Skip Avro file creation if polars is not available
+            self.avro_file = None
+
+    def _get_filesystems(self, file_path):
+        """Get different filesystem implementations for testing."""
+        # fsspec AbstractFileSystem
+        fsspec_fs = fsspec.filesystem("file")
+
+        # PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+
+        # None for automatic inference
+        auto_infer_fs = None
+
+        return [
+            ("fsspec", fsspec_fs),
+            ("pyarrow", pyarrow_fs),
+            ("auto_infer", auto_infer_fs),
+        ]
+
+    def _assert_table_content(self, table, content_type):
+        """Assert that the loaded table has expected content."""
+        self.assertEqual(len(table), 5, f"Expected 5 rows for {content_type}")
+        self.assertEqual(
+            len(table.columns), 4, f"Expected 4 columns for {content_type}"
+        )
+
+        # Check column names exist (order might vary for some formats)
+        column_names = set(table.column_names)
+        expected_columns = {"id", "name", "age", "score"}
+        self.assertEqual(
+            column_names, expected_columns, f"Column names mismatch for {content_type}"
+        )
+
+    def test_csv_all_filesystems(self):
+        """Test CSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"CSV with {fs_name}")
+
+    def test_tsv_all_filesystems(self):
+        """Test TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.tsv_file,
+                    ContentType.TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"TSV with {fs_name}")
+
+    def test_psv_all_filesystems(self):
+        """Test PSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.psv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.psv_file,
+                    ContentType.PSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"PSV with {fs_name}")
+
+    def test_unescaped_tsv_all_filesystems(self):
+        """Test unescaped TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.unescaped_tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.unescaped_tsv_file,
+                    ContentType.UNESCAPED_TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"UNESCAPED_TSV with {fs_name}")
+
+    def test_parquet_all_filesystems(self):
+        """Test Parquet reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"PARQUET with {fs_name}")
+
+    def test_feather_all_filesystems(self):
+        """Test Feather reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.feather_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.feather_file,
+                    ContentType.FEATHER.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"FEATHER with {fs_name}")
+
+    def test_json_all_filesystems(self):
+        """Test JSON reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.json_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.json_file,
+                    ContentType.JSON.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"JSON with {fs_name}")
+
+    def test_orc_all_filesystems(self):
+        """Test ORC reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.orc_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.orc_file,
+                    ContentType.ORC.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"ORC with {fs_name}")
+
+    def test_avro_all_filesystems(self):
+        """Test Avro reading with all filesystem types."""
+        if self.avro_file is None:
+            self.skipTest("Avro file creation skipped (polars not available)")
+
+        for fs_name, filesystem in self._get_filesystems(self.avro_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.avro_file,
+                    ContentType.AVRO.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"AVRO with {fs_name}")
+
+    def test_column_selection_all_filesystems(self):
+        """Test column selection works with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    include_columns=["name", "age"],
+                )
+                self.assertEqual(
+                    len(table.columns), 2, f"Expected 2 columns with {fs_name}"
+                )
+                self.assertEqual(
+                    set(table.column_names),
+                    {"name", "age"},
+                    f"Column selection failed with {fs_name}",
+                )
+
+    def test_kwargs_provider_all_filesystems(self):
+        """Test that kwargs providers work with all filesystem types."""
+
+        def schema_provider(content_type, kwargs):
+            if content_type == ContentType.CSV.value:
+                # Force all columns to be strings
+                kwargs["convert_options"] = pacsv.ConvertOptions(
+                    column_types={
+                        "id": pa.string(),
+                        "name": pa.string(),
+                        "age": pa.string(),
+                        "score": pa.string(),
+                    }
+                )
+            return kwargs
+
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                    pa_read_func_kwargs_provider=schema_provider,
+                )
+                # Check that all columns are strings
+                for field in table.schema:
+                    self.assertEqual(
+                        field.type,
+                        pa.string(),
+                        f"Column {field.name} should be string with {fs_name}",
+                    )
+
+    def test_filesystem_auto_inference_consistency(self):
+        """Test that auto-inferred filesystem produces same results as explicit filesystems."""
+        # Use Parquet as it's most reliable across filesystem types
+
+        # Read with auto-inference
+        auto_table = file_to_table(
+            self.parquet_file,
             ContentType.PARQUET.value,
             ContentEncoding.IDENTITY.value,
-
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+            filesystem=None,  # Auto-infer
         )

-
-
+        # Read with explicit fsspec filesystem
+        fsspec_fs = fsspec.filesystem("file")
+        fsspec_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=fsspec_fs,
+        )

-
-
-
+        # Read with explicit PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+        pyarrow_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=pyarrow_fs,
+        )
+
+        # All should produce equivalent results
+        self.assertTrue(
+            auto_table.equals(fsspec_table),
+            "Auto-inferred result should match fsspec result",
+        )
+        self.assertTrue(
+            auto_table.equals(pyarrow_table),
+            "Auto-inferred result should match PyArrow result",
+        )

-
+    def test_error_handling_all_filesystems(self):
+        """Test error handling works consistently across filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                # Test unsupported content type
+                with self.assertRaises(NotImplementedError):
+                    file_to_table(
+                        self.parquet_file,
+                        "UNSUPPORTED_TYPE",
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
+
+                # Test non-existent file
+                with self.assertRaises((FileNotFoundError, OSError)):
+                    file_to_table(
+                        f"{self.tmpdir}/non_existent.parquet",
+                        ContentType.PARQUET.value,
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
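The `_get_filesystems` helper above sweeps the same three filesystem flavors in every test: an fsspec `AbstractFileSystem`, a `pyarrow.fs` filesystem, and `None` for automatic inference. The sketch below shows that matrix driven through a plain pyarrow read so it runs without deltacat; `read_parquet_via` is a hypothetical stand-in reader, and the real subject of this diff is `file_to_table`, whose signature is known here only from how the tests call it.

```python
# Minimal sketch of the filesystem matrix exercised by the tests above.
# read_parquet_via() is a hypothetical stand-in so this snippet runs
# without deltacat; it is not the file_to_table implementation.
import tempfile

import fsspec
import pyarrow as pa
import pyarrow.fs as pafs
import pyarrow.parquet as papq


def read_parquet_via(path, filesystem=None):
    # pyarrow can take an fsspec filesystem, a pyarrow.fs filesystem,
    # or None (inferred from the path) for the `filesystem` argument.
    return papq.read_table(path, filesystem=filesystem)


with tempfile.TemporaryDirectory() as tmpdir:
    path = f"{tmpdir}/example.parquet"
    papq.write_table(pa.table({"id": [1, 2, 3]}), path)

    flavors = [
        ("fsspec", fsspec.filesystem("file")),
        ("pyarrow", pafs.LocalFileSystem()),
        ("auto_infer", None),
    ]
    tables = {name: read_parquet_via(path, filesystem=fs) for name, fs in flavors}
    assert tables["fsspec"].equals(tables["pyarrow"])
    assert tables["fsspec"].equals(tables["auto_infer"])
```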