deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the supported public registries, and is provided for informational purposes only.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,33 @@
1
1
  from unittest import TestCase
2
2
  from deltacat.utils.pyarrow import (
3
- s3_partial_parquet_file_to_table,
3
+ partial_parquet_file_to_table,
4
4
  pyarrow_read_csv,
5
5
  ContentTypeValidationError,
6
6
  content_type_to_reader_kwargs,
7
7
  _add_column_kwargs,
8
- logger,
9
- s3_file_to_table,
10
- s3_file_to_parquet,
8
+ file_to_table,
9
+ file_to_parquet,
10
+ table_to_file,
11
11
  ReadKwargsProviderPyArrowSchemaOverride,
12
- RAISE_ON_EMPTY_CSV_KWARG,
12
+ ReadKwargsProviderPyArrowCsvPureUtf8,
13
13
  RAISE_ON_DECIMAL_OVERFLOW,
14
- OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
14
+ RAISE_ON_EMPTY_CSV_KWARG,
15
15
  )
16
16
  import decimal
17
17
  from deltacat.types.media import ContentEncoding, ContentType
18
18
  from deltacat.types.partial_download import PartialParquetParameters
19
19
  from pyarrow.parquet import ParquetFile
20
+ import tempfile
20
21
  import pyarrow as pa
22
+ from pyarrow import csv as pacsv
23
+ import fsspec
24
+ import gzip
25
+ import json
26
+ from pyarrow import (
27
+ feather as paf,
28
+ parquet as papq,
29
+ orc as paorc,
30
+ )
21
31
 
22
32
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
23
33
  PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
@@ -33,8 +43,8 @@ GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed
33
43
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
34
44
 
35
45
 
36
- class TestS3PartialParquetFileToTable(TestCase):
37
- def test_s3_partial_parquet_file_to_table_sanity(self):
46
+ class TestPartialParquetFileToTable(TestCase):
47
+ def test_partial_parquet_file_to_table_sanity(self):
38
48
 
39
49
  pq_file = ParquetFile(PARQUET_FILE_PATH)
40
50
  partial_parquet_params = PartialParquetParameters.of(
@@ -48,7 +58,7 @@ class TestS3PartialParquetFileToTable(TestCase):
48
58
  # only first row group to be downloaded
49
59
  partial_parquet_params.row_groups_to_download.pop()
50
60
 
51
- result = s3_partial_parquet_file_to_table(
61
+ result = partial_parquet_file_to_table(
52
62
  PARQUET_FILE_PATH,
53
63
  include_columns=["n_legs"],
54
64
  content_encoding=ContentEncoding.IDENTITY.value,
@@ -59,7 +69,7 @@ class TestS3PartialParquetFileToTable(TestCase):
59
69
  self.assertEqual(len(result), 3)
60
70
  self.assertEqual(len(result.columns), 1)
61
71
 
62
- def test_s3_partial_parquet_file_to_table_when_schema_passed(self):
72
+ def test_partial_parquet_file_to_table_when_schema_passed(self):
63
73
 
64
74
  pq_file = ParquetFile(PARQUET_FILE_PATH)
65
75
  partial_parquet_params = PartialParquetParameters.of(
@@ -79,7 +89,7 @@ class TestS3PartialParquetFileToTable(TestCase):
79
89
 
80
90
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
81
91
 
82
- result = s3_partial_parquet_file_to_table(
92
+ result = partial_parquet_file_to_table(
83
93
  PARQUET_FILE_PATH,
84
94
  ContentType.PARQUET.value,
85
95
  ContentEncoding.IDENTITY.value,
@@ -98,7 +108,7 @@ class TestS3PartialParquetFileToTable(TestCase):
98
108
  self.assertEqual(result_schema.field(2).type, "int64")
99
109
  self.assertEqual(result_schema.field(2).name, "MISSING")
100
110
 
101
- def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
111
+ def test_partial_parquet_file_to_table_when_schema_missing_columns(self):
102
112
 
103
113
  pq_file = ParquetFile(PARQUET_FILE_PATH)
104
114
  partial_parquet_params = PartialParquetParameters.of(
@@ -118,7 +128,7 @@ class TestS3PartialParquetFileToTable(TestCase):
118
128
 
119
129
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
120
130
 
121
- result = s3_partial_parquet_file_to_table(
131
+ result = partial_parquet_file_to_table(
122
132
  PARQUET_FILE_PATH,
123
133
  ContentType.PARQUET.value,
124
134
  ContentEncoding.IDENTITY.value,
@@ -135,7 +145,7 @@ class TestS3PartialParquetFileToTable(TestCase):
135
145
  self.assertEqual(result_schema.field(0).type, "int64")
136
146
  self.assertEqual(result_schema.field(0).name, "MISSING")
137
147
 
138
- def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
148
+ def test_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
139
149
  self,
140
150
  ):
141
151
 
@@ -152,11 +162,11 @@ class TestS3PartialParquetFileToTable(TestCase):
152
162
 
153
163
  pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
154
164
 
155
- result = s3_partial_parquet_file_to_table(
165
+ result = partial_parquet_file_to_table(
156
166
  PARQUET_FILE_PATH,
157
167
  ContentType.PARQUET.value,
158
168
  ContentEncoding.IDENTITY.value,
159
- ["n_legs", "animal"],
169
+ column_names=["n_legs", "animal"],
160
170
  pa_read_func_kwargs_provider=pa_kwargs_provider,
161
171
  partial_file_download_params=partial_parquet_params,
162
172
  )
@@ -168,7 +178,7 @@ class TestS3PartialParquetFileToTable(TestCase):
168
178
  self.assertEqual(result_schema.field(0).type, "string")
169
179
  self.assertEqual(result_schema.field(0).name, "n_legs") # order doesn't change
170
180
 
171
- def test_s3_partial_parquet_file_to_table_when_multiple_row_groups(self):
181
+ def test_partial_parquet_file_to_table_when_multiple_row_groups(self):
172
182
 
173
183
  pq_file = ParquetFile(PARQUET_FILE_PATH)
174
184
  partial_parquet_params = PartialParquetParameters.of(
@@ -179,7 +189,7 @@ class TestS3PartialParquetFileToTable(TestCase):
179
189
  partial_parquet_params.num_row_groups, 2, "test_file.parquet has changed."
180
190
  )
181
191
 
182
- result = s3_partial_parquet_file_to_table(
192
+ result = partial_parquet_file_to_table(
183
193
  PARQUET_FILE_PATH,
184
194
  content_encoding=ContentEncoding.IDENTITY.value,
185
195
  content_type=ContentType.PARQUET.value,
@@ -668,301 +678,1140 @@ class TestReadCSV(TestCase):
668
678
  self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
669
679
 
670
680
 
671
- class TestS3FileToTable(TestCase):
672
- def test_s3_file_to_table_identity_sanity(self):
681
+ class TestWriters(TestCase):
682
+ def setUp(self):
683
+ self.table = pa.table({"col1": ["a,b\tc|d", "e,f\tg|h"], "col2": [1, 2]})
684
+ self.fs = fsspec.filesystem("file")
685
+ self.base_path = tempfile.mkdtemp()
686
+ self.fs.makedirs(self.base_path, exist_ok=True)
687
+
688
+ def tearDown(self):
689
+ self.fs.rm(self.base_path, recursive=True)
690
+
691
+ def test_write_feather(self):
692
+ path = f"{self.base_path}/test.feather"
693
+
694
+ table_to_file(
695
+ self.table,
696
+ path,
697
+ self.fs,
698
+ lambda x: path,
699
+ content_type=ContentType.FEATHER.value,
700
+ )
701
+ assert self.fs.exists(path), "file was not written"
702
+
703
+ # Verify content
704
+ result = paf.read_table(path)
705
+ assert result.equals(self.table)
706
+
707
+ def test_write_csv(self):
708
+ path = f"{self.base_path}/test.csv.gz"
709
+
710
+ table_to_file(
711
+ self.table,
712
+ path,
713
+ self.fs,
714
+ lambda x: path,
715
+ content_type=ContentType.CSV.value,
716
+ )
717
+ assert self.fs.exists(path), "file was not written"
718
+
719
+ # Verify content (should be GZIP compressed)
720
+ with self.fs.open(path, "rb") as f:
721
+ with gzip.GzipFile(fileobj=f) as gz:
722
+ content = gz.read().decode("utf-8")
723
+ # Should be quoted due to commas in data
724
+ assert '"a,b\tc|d",1' in content
725
+ assert '"e,f\tg|h",2' in content
726
+
727
+ def test_write_tsv(self):
728
+ path = f"{self.base_path}/test.tsv.gz"
729
+
730
+ table_to_file(
731
+ self.table,
732
+ path,
733
+ self.fs,
734
+ lambda x: path,
735
+ content_type=ContentType.TSV.value,
736
+ )
737
+ assert self.fs.exists(path), "file was not written"
738
+
739
+ # Verify content (should be GZIP compressed)
740
+ with self.fs.open(path, "rb") as f:
741
+ with gzip.GzipFile(fileobj=f) as gz:
742
+ content = gz.read().decode("utf-8")
743
+ # Should be quoted due to tabs in data
744
+ assert '"a,b\tc|d"\t1' in content
745
+ assert '"e,f\tg|h"\t2' in content
746
+
747
+ def test_write_psv(self):
748
+ path = f"{self.base_path}/test.psv.gz"
749
+
750
+ table_to_file(
751
+ self.table,
752
+ path,
753
+ self.fs,
754
+ lambda x: path,
755
+ content_type=ContentType.PSV.value,
756
+ )
757
+ assert self.fs.exists(path), "file was not written"
758
+
759
+ # Verify content (should be GZIP compressed)
760
+ with self.fs.open(path, "rb") as f:
761
+ with gzip.GzipFile(fileobj=f) as gz:
762
+ content = gz.read().decode("utf-8")
763
+ # Should be quoted due to pipes in data
764
+ assert '"a,b\tc|d"|1' in content
765
+ assert '"e,f\tg|h"|2' in content
766
+
767
+ def test_write_unescaped_tsv(self):
768
+ # Create table without delimiters for unescaped TSV
769
+ table = pa.table({"col1": ["abc", "def"], "col2": [1, 2]})
770
+ path = f"{self.base_path}/test.tsv.gz"
771
+
772
+ table_to_file(
773
+ table,
774
+ path,
775
+ self.fs,
776
+ lambda x: path,
777
+ content_type=ContentType.UNESCAPED_TSV.value,
778
+ )
779
+ assert self.fs.exists(path), "file was not written"
780
+
781
+ # Verify content (should be GZIP compressed)
782
+ with self.fs.open(path, "rb") as f:
783
+ with gzip.GzipFile(fileobj=f) as gz:
784
+ content = gz.read().decode("utf-8")
785
+ # With quoting_style="none", strings should not be quoted
786
+ assert "abc\t1" in content
787
+ assert "def\t2" in content
788
+
789
+ def test_write_orc(self):
790
+ path = f"{self.base_path}/test.orc"
791
+
792
+ table_to_file(
793
+ self.table,
794
+ path,
795
+ self.fs,
796
+ lambda x: path,
797
+ content_type=ContentType.ORC.value,
798
+ )
799
+ assert self.fs.exists(path), "file was not written"
800
+
801
+ # Verify content
802
+ result = paorc.read_table(path)
803
+ assert result.equals(self.table)
804
+
805
+ def test_write_parquet(self):
806
+ path = f"{self.base_path}/test.parquet"
807
+
808
+ table_to_file(
809
+ self.table,
810
+ path,
811
+ self.fs,
812
+ lambda x: path,
813
+ content_type=ContentType.PARQUET.value,
814
+ )
815
+ assert self.fs.exists(path), "file was not written"
816
+
817
+ # Verify content
818
+ result = papq.read_table(path)
819
+ assert result.equals(self.table)
820
+
821
+ def test_write_json(self):
822
+ path = f"{self.base_path}/test.json.gz"
823
+
824
+ table_to_file(
825
+ self.table,
826
+ path,
827
+ self.fs,
828
+ lambda x: path,
829
+ content_type=ContentType.JSON.value,
830
+ )
831
+ assert self.fs.exists(path), "file was not written"
832
+
833
+ # Verify content (should be GZIP compressed)
834
+ with self.fs.open(path, "rb") as f:
835
+ with gzip.GzipFile(fileobj=f) as gz:
836
+ content = gz.read().decode("utf-8")
837
+ # Each line should be a valid JSON object
838
+ lines = [
839
+ line for line in content.split("\n") if line
840
+ ] # Skip empty lines
841
+ assert len(lines) == 2 # 2 records
842
+ assert json.loads(lines[0]) == {"col1": "a,b\tc|d", "col2": 1}
843
+ assert json.loads(lines[1]) == {"col1": "e,f\tg|h", "col2": 2}
844
+
845
+ def test_write_avro(self):
846
+ import polars as pl
847
+
848
+ path = f"{self.base_path}/test.avro"
849
+
850
+ table_to_file(
851
+ self.table,
852
+ path,
853
+ self.fs,
854
+ lambda x: path,
855
+ content_type=ContentType.AVRO.value,
856
+ )
857
+ assert self.fs.exists(path), "file was not written"
858
+
859
+ # Verify content by reading with polars
860
+ result = pl.read_avro(path).to_arrow()
861
+ # Cast the result to match the original table's schema
862
+ # (the round-trip from arrow->polars->arrow casts string to large string)
863
+ result = result.cast(self.table.schema)
864
+ assert result.equals(self.table)
865
+
866
+
867
+ class TestPyArrowReaders(TestCase):
868
+ def setUp(self):
869
+ # Create test data files for reading
870
+ self.fs = fsspec.filesystem("file")
871
+ self.base_path = tempfile.mkdtemp()
872
+ self.fs.makedirs(self.base_path, exist_ok=True)
873
+
874
+ # Create test Table
875
+ self.table = pa.Table.from_pylist(
876
+ [
877
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
878
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
879
+ {"col1": "test", "col2": 3, "col3": 3.3},
880
+ ]
881
+ )
673
882
 
674
- schema = pa.schema(
675
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
883
+ # Write test files in different formats
884
+ self._create_test_files()
885
+
886
+ def tearDown(self):
887
+ self.fs.rm(self.base_path, recursive=True)
888
+
889
+ def _create_test_files(self):
890
+ # Create CSV file (GZIP compressed)
891
+ csv_path = f"{self.base_path}/test.csv"
892
+ with self.fs.open(csv_path, "wb") as f:
893
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
894
+ content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
895
+ gz.write(content.encode("utf-8"))
896
+
897
+ # Create TSV file (GZIP compressed)
898
+ tsv_path = f"{self.base_path}/test.tsv"
899
+ with self.fs.open(tsv_path, "wb") as f:
900
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
901
+ content = '"a,b\tc|d"\t1\t1.1\n"e,f\tg|h"\t2\t2.2\ntest\t3\t3.3\n'
902
+ gz.write(content.encode("utf-8"))
903
+
904
+ # Create PSV file (GZIP compressed)
905
+ psv_path = f"{self.base_path}/test.psv"
906
+ with self.fs.open(psv_path, "wb") as f:
907
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
908
+ content = '"a,b\tc|d"|1|1.1\n"e,f\tg|h"|2|2.2\ntest|3|3.3\n'
909
+ gz.write(content.encode("utf-8"))
910
+
911
+ # Create unescaped TSV file (GZIP compressed)
912
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
913
+ pa.Table.from_pylist(
914
+ [
915
+ {"col1": "abc", "col2": 1, "col3": 1.1},
916
+ {"col1": "def", "col2": 2, "col3": 2.2},
917
+ {"col1": "ghi", "col2": 3, "col3": 3.3},
918
+ ]
676
919
  )
920
+ with self.fs.open(unescaped_tsv_path, "wb") as f:
921
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
922
+ content = "abc\t1\t1.1\ndef\t2\t2.2\nghi\t3\t3.3\n"
923
+ gz.write(content.encode("utf-8"))
924
+
925
+ # Create Parquet file
926
+ parquet_path = f"{self.base_path}/test.parquet"
927
+ with self.fs.open(parquet_path, "wb") as f:
928
+ papq.write_table(self.table, f)
929
+
930
+ # Create Feather file
931
+ feather_path = f"{self.base_path}/test.feather"
932
+ with self.fs.open(feather_path, "wb") as f:
933
+ paf.write_feather(self.table, f)
934
+
935
+ # Create JSON file (GZIP compressed)
936
+ json_path = f"{self.base_path}/test.json"
937
+ with self.fs.open(json_path, "wb") as f:
938
+ with gzip.GzipFile(fileobj=f, mode="wb") as gz:
939
+ # Create NDJSON format - one JSON object per line
940
+ lines = []
941
+ for row in self.table.to_pylist():
942
+ lines.append(json.dumps(row))
943
+ content = "\n".join(lines) + "\n"
944
+ gz.write(content.encode("utf-8"))
945
+
946
+ # Create Avro file using polars (since pyarrow delegates to polars for Avro)
947
+ avro_path = f"{self.base_path}/test.avro"
948
+ import polars as pl
949
+
950
+ pl_df = pl.from_arrow(self.table)
951
+ pl_df.write_avro(avro_path)
952
+
953
+ # Create ORC file
954
+ orc_path = f"{self.base_path}/test.orc"
955
+ with self.fs.open(orc_path, "wb") as f:
956
+ paorc.write_table(self.table, f)
957
+
958
+ def test_content_type_to_reader_kwargs(self):
959
+ # Test CSV kwargs
960
+ csv_kwargs = content_type_to_reader_kwargs(ContentType.CSV.value)
961
+ expected_csv = {"parse_options": pacsv.ParseOptions(delimiter=",")}
962
+ assert (
963
+ csv_kwargs["parse_options"].delimiter
964
+ == expected_csv["parse_options"].delimiter
965
+ )
966
+
967
+ # Test TSV kwargs
968
+ tsv_kwargs = content_type_to_reader_kwargs(ContentType.TSV.value)
969
+ expected_tsv = {"parse_options": pacsv.ParseOptions(delimiter="\t")}
970
+ assert (
971
+ tsv_kwargs["parse_options"].delimiter
972
+ == expected_tsv["parse_options"].delimiter
973
+ )
974
+
975
+ # Test PSV kwargs
976
+ psv_kwargs = content_type_to_reader_kwargs(ContentType.PSV.value)
977
+ expected_psv = {"parse_options": pacsv.ParseOptions(delimiter="|")}
978
+ assert (
979
+ psv_kwargs["parse_options"].delimiter
980
+ == expected_psv["parse_options"].delimiter
981
+ )
982
+
983
+ # Test unescaped TSV kwargs
984
+ unescaped_kwargs = content_type_to_reader_kwargs(
985
+ ContentType.UNESCAPED_TSV.value
986
+ )
987
+ assert unescaped_kwargs["parse_options"].delimiter == "\t"
988
+ assert unescaped_kwargs["parse_options"].quote_char is False
989
+ assert unescaped_kwargs["convert_options"].null_values == [""]
990
+
991
+ # Test Parquet kwargs (should be empty)
992
+ parquet_kwargs = content_type_to_reader_kwargs(ContentType.PARQUET.value)
993
+ assert parquet_kwargs == {}
994
+
995
+ # Test ORC kwargs (should be empty)
996
+ orc_kwargs = content_type_to_reader_kwargs(ContentType.ORC.value)
997
+ assert orc_kwargs == {}
998
+
999
+ # Test Avro kwargs (should be empty)
1000
+ avro_kwargs = content_type_to_reader_kwargs(ContentType.AVRO.value)
1001
+ assert avro_kwargs == {}
1002
+
1003
+ def test_add_column_kwargs(self):
1004
+ kwargs = {}
1005
+ column_names = ["col1", "col2", "col3"]
1006
+ include_columns = ["col1", "col2"]
1007
+
1008
+ # Test CSV column kwargs
1009
+ _add_column_kwargs(ContentType.CSV.value, column_names, include_columns, kwargs)
1010
+ assert kwargs["read_options"].column_names == column_names
1011
+ assert kwargs["convert_options"].include_columns == include_columns
1012
+
1013
+ # Test Parquet column kwargs
1014
+ kwargs = {}
1015
+ _add_column_kwargs(
1016
+ ContentType.PARQUET.value, column_names, include_columns, kwargs
1017
+ )
1018
+ assert kwargs["columns"] == include_columns
677
1019
 
678
- result = s3_file_to_table(
679
- NON_EMPTY_VALID_UTSV_PATH,
680
- ContentType.UNESCAPED_TSV.value,
681
- ContentEncoding.IDENTITY.value,
682
- ["is_active", "ship_datetime_utc"],
683
- None,
684
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
685
- schema=schema
686
- ),
1020
+ def test_file_to_table_csv(self):
1021
+ # Test reading CSV with file_to_table
1022
+ csv_path = f"{self.base_path}/test.csv"
1023
+
1024
+ result = file_to_table(
1025
+ csv_path,
1026
+ ContentType.CSV.value,
1027
+ ContentEncoding.GZIP.value,
1028
+ filesystem=self.fs,
1029
+ column_names=["col1", "col2", "col3"],
687
1030
  )
688
1031
 
689
- self.assertEqual(len(result), 3)
690
- self.assertEqual(len(result.column_names), 2)
691
- result_schema = result.schema
692
- for index, field in enumerate(result_schema):
693
- self.assertEqual(field.name, schema.field(index).name)
1032
+ assert len(result) == 3
1033
+ assert result.column_names == ["col1", "col2", "col3"]
1034
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
694
1035
 
695
- self.assertEqual(result.schema.field(0).type, "string")
1036
+ def test_file_to_table_tsv(self):
1037
+ # Test reading TSV with file_to_table
1038
+ tsv_path = f"{self.base_path}/test.tsv"
696
1039
 
697
- def test_s3_file_to_table_gzip_compressed_sanity(self):
1040
+ result = file_to_table(
1041
+ tsv_path,
1042
+ ContentType.TSV.value,
1043
+ ContentEncoding.GZIP.value,
1044
+ filesystem=self.fs,
1045
+ column_names=["col1", "col2", "col3"],
1046
+ )
698
1047
 
699
- schema = pa.schema(
700
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
1048
+ assert len(result) == 3
1049
+ assert result.column_names == ["col1", "col2", "col3"]
1050
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1051
+
1052
+ def test_file_to_table_psv(self):
1053
+ # Test reading PSV with file_to_table
1054
+ psv_path = f"{self.base_path}/test.psv"
1055
+
1056
+ result = file_to_table(
1057
+ psv_path,
1058
+ ContentType.PSV.value,
1059
+ ContentEncoding.GZIP.value,
1060
+ filesystem=self.fs,
1061
+ column_names=["col1", "col2", "col3"],
701
1062
  )
702
1063
 
703
- result = s3_file_to_table(
704
- GZIP_COMPRESSED_FILE_UTSV_PATH,
1064
+ assert len(result) == 3
1065
+ assert result.column_names == ["col1", "col2", "col3"]
1066
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1067
+
1068
+ def test_file_to_table_unescaped_tsv(self):
1069
+ # Test reading unescaped TSV with file_to_table
1070
+ unescaped_tsv_path = f"{self.base_path}/test_unescaped.tsv"
1071
+
1072
+ result = file_to_table(
1073
+ unescaped_tsv_path,
705
1074
  ContentType.UNESCAPED_TSV.value,
706
1075
  ContentEncoding.GZIP.value,
707
- ["is_active", "ship_datetime_utc"],
708
- None,
709
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
710
- schema=schema
711
- ),
1076
+ filesystem=self.fs,
1077
+ column_names=["col1", "col2", "col3"],
712
1078
  )
713
1079
 
714
- self.assertEqual(len(result), 3)
715
- self.assertEqual(len(result.column_names), 2)
716
- result_schema = result.schema
717
- for index, field in enumerate(result_schema):
718
- self.assertEqual(field.name, schema.field(index).name)
1080
+ assert len(result) == 3
1081
+ assert result.column_names == ["col1", "col2", "col3"]
1082
+ assert result.column("col1").to_pylist() == ["abc", "def", "ghi"]
719
1083
 
720
- self.assertEqual(result.schema.field(0).type, "string")
1084
+ def test_file_to_table_parquet(self):
1085
+ # Test reading Parquet with file_to_table
1086
+ parquet_path = f"{self.base_path}/test.parquet"
721
1087
 
722
- def test_s3_file_to_table_bz2_compressed_sanity(self):
1088
+ result = file_to_table(
1089
+ parquet_path, ContentType.PARQUET.value, filesystem=self.fs
1090
+ )
723
1091
 
724
- schema = pa.schema(
725
- [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
1092
+ assert len(result) == 3
1093
+ assert result.column_names == ["col1", "col2", "col3"]
1094
+ assert result.equals(self.table)
1095
+
1096
+ def test_file_to_table_feather(self):
1097
+ # Test reading Feather with file_to_table
1098
+ feather_path = f"{self.base_path}/test.feather"
1099
+
1100
+ result = file_to_table(
1101
+ feather_path, ContentType.FEATHER.value, filesystem=self.fs
726
1102
  )
727
1103
 
728
- result = s3_file_to_table(
729
- BZ2_COMPRESSED_FILE_UTSV_PATH,
730
- ContentType.UNESCAPED_TSV.value,
731
- ContentEncoding.BZIP2.value,
732
- ["is_active", "ship_datetime_utc"],
733
- None,
734
- pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
735
- schema=schema
736
- ),
1104
+ assert len(result) == 3
1105
+ assert result.column_names == ["col1", "col2", "col3"]
1106
+ assert result.equals(self.table)
1107
+
1108
+ def test_file_to_table_json(self):
1109
+ # Test reading JSON with file_to_table
1110
+ json_path = f"{self.base_path}/test.json"
1111
+
1112
+ result = file_to_table(
1113
+ json_path,
1114
+ ContentType.JSON.value,
1115
+ ContentEncoding.GZIP.value,
1116
+ filesystem=self.fs,
737
1117
  )
738
1118
 
739
- self.assertEqual(len(result), 3)
740
- self.assertEqual(len(result.column_names), 2)
741
- result_schema = result.schema
742
- for index, field in enumerate(result_schema):
743
- self.assertEqual(field.name, schema.field(index).name)
1119
+ assert len(result) == 3
1120
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1121
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
744
1122
 
745
- self.assertEqual(result.schema.field(0).type, "string")
1123
+ def test_file_to_table_avro(self):
1124
+ # Test reading Avro with file_to_table
1125
+ avro_path = f"{self.base_path}/test.avro"
746
1126
 
747
- def test_s3_file_to_table_when_parquet_sanity(self):
1127
+ result = file_to_table(avro_path, ContentType.AVRO.value, filesystem=self.fs)
748
1128
 
749
- pa_kwargs_provider = lambda content_type, kwargs: {
750
- "reader_type": "pyarrow",
751
- **kwargs,
752
- }
1129
+ assert len(result) == 3
1130
+ assert result.column_names == ["col1", "col2", "col3"]
1131
+ # Avro may have different dtypes, so compare values
1132
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
753
1133
 
754
- result = s3_file_to_table(
755
- PARQUET_FILE_PATH,
756
- ContentType.PARQUET.value,
757
- ContentEncoding.IDENTITY.value,
758
- ["n_legs", "animal"],
759
- ["n_legs"],
760
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1134
+ def test_file_to_table_orc(self):
1135
+ # Test reading ORC with file_to_table
1136
+ orc_path = f"{self.base_path}/test.orc"
1137
+
1138
+ result = file_to_table(orc_path, ContentType.ORC.value, filesystem=self.fs)
1139
+
1140
+ assert len(result) == 3
1141
+ assert result.column_names == ["col1", "col2", "col3"]
1142
+ assert result.equals(self.table)
1143
+
1144
+ def test_file_to_table_with_column_selection(self):
1145
+ # Test reading with column selection
1146
+ csv_path = f"{self.base_path}/test.csv"
1147
+
1148
+ result = file_to_table(
1149
+ csv_path,
1150
+ ContentType.CSV.value,
1151
+ ContentEncoding.GZIP.value,
1152
+ filesystem=self.fs,
1153
+ column_names=["col1", "col2", "col3"],
1154
+ include_columns=["col1", "col2"],
761
1155
  )
762
1156
 
763
- self.assertEqual(len(result), 6)
764
- self.assertEqual(len(result.column_names), 1)
765
- schema = result.schema
766
- schema_index = schema.get_field_index("n_legs")
767
- self.assertEqual(schema.field(schema_index).type, "int64")
1157
+ assert len(result) == 3
1158
+ assert len(result.column_names) == 2 # Should only have 2 columns
1159
+ assert result.column_names == ["col1", "col2"]
768
1160
 
769
- def test_s3_file_to_table_when_parquet_schema_overridden(self):
1161
+ def test_file_to_table_with_kwargs_provider(self):
1162
+ # Test reading with kwargs provider
1163
+ csv_path = f"{self.base_path}/test.csv"
1164
+ provider = ReadKwargsProviderPyArrowCsvPureUtf8(
1165
+ include_columns=["col1", "col2", "col3"]
1166
+ )
770
1167
 
771
- schema = pa.schema(
772
- [pa.field("animal", pa.string()), pa.field("n_legs", pa.string())]
1168
+ result = file_to_table(
1169
+ csv_path,
1170
+ ContentType.CSV.value,
1171
+ ContentEncoding.GZIP.value,
1172
+ filesystem=self.fs,
1173
+ column_names=["col1", "col2", "col3"],
1174
+ pa_read_func_kwargs_provider=provider,
773
1175
  )
774
1176
 
775
- pa_kwargs_provider = lambda content_type, kwargs: {
776
- "schema": schema,
777
- "reader_type": "pyarrow",
778
- **kwargs,
779
- }
1177
+ assert len(result) == 3
1178
+ assert result.column_names == ["col1", "col2", "col3"]
1179
+ # With string types provider, all columns should be strings
1180
+ for col_name in result.column_names:
1181
+ assert result.schema.field(col_name).type == pa.string()
780
1182
 
781
- result = s3_file_to_table(
782
- PARQUET_FILE_PATH,
783
- ContentType.PARQUET.value,
784
- ContentEncoding.IDENTITY.value,
785
- ["n_legs", "animal"],
786
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1183
+ def test_file_to_table_filesystem_inference(self):
1184
+ # Test filesystem inference when no filesystem is provided
1185
+ # Use JSON file since it should work well with inference
1186
+ json_path = f"{self.base_path}/test.json"
1187
+
1188
+ result = file_to_table(
1189
+ json_path,
1190
+ ContentType.JSON.value,
1191
+ ContentEncoding.GZIP.value
1192
+ # No filesystem provided - should be inferred
787
1193
  )
788
1194
 
789
- self.assertEqual(len(result), 6)
790
- self.assertEqual(len(result.column_names), 2)
1195
+ assert len(result) == 3
1196
+ assert set(result.column_names) == {"col1", "col2", "col3"}
1197
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
791
1198
 
792
- result_schema = result.schema
793
- for index, field in enumerate(result_schema):
794
- self.assertEqual(field.name, schema.field(index).name)
1199
+ def test_file_to_table_unsupported_content_type(self):
1200
+ # Test error handling for unsupported content type
1201
+ parquet_path = f"{self.base_path}/test.parquet"
795
1202
 
796
- self.assertEqual(result.schema.field(1).type, "string")
1203
+ with self.assertRaises(NotImplementedError) as context:
1204
+ file_to_table(parquet_path, "unsupported/content-type", filesystem=self.fs)
797
1205
 
798
- def test_s3_file_to_table_when_parquet_gzip(self):
1206
+ assert "not implemented" in str(context.exception)
799
1207
 
800
- pa_kwargs_provider = lambda content_type, kwargs: {
801
- "reader_type": "pyarrow",
802
- **kwargs,
803
- }
1208
+ def test_file_to_table_bzip2_compression(self):
1209
+ # Test BZIP2 compression handling
1210
+ import bz2
804
1211
 
805
- result = s3_file_to_table(
806
- PARQUET_GZIP_COMPRESSED_FILE_PATH,
807
- ContentType.PARQUET.value,
808
- ContentEncoding.GZIP.value,
809
- ["n_legs", "animal"],
810
- ["n_legs"],
811
- pa_read_func_kwargs_provider=pa_kwargs_provider,
1212
+ # Create a BZIP2 compressed CSV file
1213
+ csv_content = '"a,b\tc|d",1,1.1\n"e,f\tg|h",2,2.2\ntest,3,3.3\n'
1214
+ compressed_content = bz2.compress(csv_content.encode("utf-8"))
1215
+
1216
+ bz2_path = f"{self.base_path}/test.csv.bz2"
1217
+ with self.fs.open(bz2_path, "wb") as f:
1218
+ f.write(compressed_content)
1219
+
1220
+ result = file_to_table(
1221
+ bz2_path,
1222
+ ContentType.CSV.value,
1223
+ ContentEncoding.BZIP2.value,
1224
+ filesystem=self.fs,
1225
+ column_names=["col1", "col2", "col3"],
812
1226
  )
813
1227
 
814
- self.assertEqual(len(result), 6)
815
- self.assertEqual(len(result.column_names), 1)
816
- schema = result.schema
817
- schema_index = schema.get_field_index("n_legs")
818
- self.assertEqual(schema.field(schema_index).type, "int64")
1228
+ assert len(result) == 3
1229
+ assert result.column_names == ["col1", "col2", "col3"]
1230
+ assert result.column("col1").to_pylist() == ["a,b\tc|d", "e,f\tg|h", "test"]
1231
+
1232
+
1233
+ class TestFileToParquet(TestCase):
1234
+ def setUp(self):
1235
+ # Create test data files for reading
1236
+ self.fs = fsspec.filesystem("file")
1237
+ self.base_path = tempfile.mkdtemp()
1238
+ self.fs.makedirs(self.base_path, exist_ok=True)
1239
+
1240
+ # Create test Table
1241
+ self.table = pa.Table.from_pylist(
1242
+ [
1243
+ {"col1": "a,b\tc|d", "col2": 1, "col3": 1.1},
1244
+ {"col1": "e,f\tg|h", "col2": 2, "col3": 2.2},
1245
+ {"col1": "test", "col2": 3, "col3": 3.3},
1246
+ ]
1247
+ )
1248
+
1249
+ # Write test parquet files
1250
+ self._create_test_files()
1251
+
1252
+ def tearDown(self):
1253
+ self.fs.rm(self.base_path, recursive=True)
1254
+
1255
+ def _create_test_files(self):
1256
+ # Create basic Parquet file
1257
+ parquet_path = f"{self.base_path}/test.parquet"
1258
+ with self.fs.open(parquet_path, "wb") as f:
1259
+ papq.write_table(self.table, f)
1260
+
1261
+ # Create larger Parquet file with multiple row groups
1262
+ large_table = pa.Table.from_pylist(
1263
+ [{"col1": f"row_{i}", "col2": i, "col3": float(i)} for i in range(1000)]
1264
+ )
1265
+ large_parquet_path = f"{self.base_path}/test_large.parquet"
1266
+ with self.fs.open(large_parquet_path, "wb") as f:
1267
+ papq.write_table(
1268
+ large_table, f, row_group_size=100
1269
+ ) # Create multiple row groups
1270
+
1271
+ def test_file_to_parquet_basic(self):
1272
+ # Test basic parquet file reading
1273
+ parquet_path = f"{self.base_path}/test.parquet"
1274
+
1275
+ result = file_to_parquet(parquet_path, filesystem=self.fs)
1276
+
1277
+ assert isinstance(result, papq.ParquetFile)
1278
+ assert result.num_row_groups > 0
1279
+ assert result.metadata.num_rows == 3
1280
+ assert result.metadata.num_columns == 3
1281
+
1282
+ # Verify we can read the data
1283
+ table = result.read()
1284
+ assert len(table) == 3
1285
+ assert table.column_names == ["col1", "col2", "col3"]
1286
+
1287
+ def test_file_to_parquet_with_schema_provider(self):
1288
+ # Test with schema override provider
1289
+ parquet_path = f"{self.base_path}/test.parquet"
819
1290
 
820
-    def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
         schema = pa.schema(
-            [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
+            [
+                pa.field("col1", pa.string()),
+                pa.field("col2", pa.string()),  # Override to string
+                pa.field("col3", pa.string()),  # Override to string
+            ]
         )
-        # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-
-        result = s3_file_to_table(
-            GZIP_COMPRESSED_FILE_UTSV_PATH,
-            ContentType.UNESCAPED_TSV.value,
-            ContentEncoding.GZIP.value,
-            ["is_active", "ship_datetime_utc"],
-            None,
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+
+        provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        result = file_to_parquet(
+            parquet_path, filesystem=self.fs, pa_read_func_kwargs_provider=provider
         )
 
-        self.assertEqual(len(result), 3)
-        self.assertEqual(len(result.column_names), 2)
-        result_schema = result.schema
-        for index, field in enumerate(result_schema):
-            self.assertEqual(field.name, schema.field(index).name)
+        assert isinstance(result, papq.ParquetFile)
+        # Note: schema override might not affect ParquetFile metadata,
+        # but should work when reading the table
+        table = result.read()
+        assert len(table) == 3
 
-        self.assertEqual(result.schema.field(0).type, "string")
+    def test_file_to_parquet_with_custom_kwargs(self):
+        # Test with custom ParquetFile kwargs
+        parquet_path = f"{self.base_path}/test.parquet"
 
-    def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
+        result = file_to_parquet(
+            parquet_path,
+            filesystem=self.fs,
+            validate_schema=True,  # Custom kwarg for ParquetFile
+            memory_map=True,  # Another custom kwarg
+        )
 
-        result = s3_file_to_table(
-            PARQUET_FILE_PATH,
-            ContentType.PARQUET.value,
-            ContentEncoding.GZIP.value,
-            ["n_legs", "animal"],
-            ["n_legs"],
-            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+    def test_file_to_parquet_filesystem_inference(self):
+        # Test filesystem inference when no filesystem is provided
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(
+            parquet_path
+            # No filesystem provided - should be inferred
         )
 
-        self.assertEqual(len(result), 6)
-        self.assertEqual(len(result.column_names), 1)
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+        assert result.metadata.num_columns == 3
+
+    def test_file_to_parquet_large_file(self):
+        # Test with larger parquet file (multiple row groups)
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+        assert result.num_row_groups > 1  # Should have multiple row groups
+
+        # Test reading specific row groups
+        first_row_group = result.read_row_group(0)
+        assert len(first_row_group) <= 100  # Based on row_group_size=100
+
+    def test_file_to_parquet_metadata_access(self):
+        # Test accessing various metadata properties
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Test metadata access
+        metadata = result.metadata
+        assert metadata.num_rows == 3
+        assert metadata.num_columns == 3
+        assert metadata.num_row_groups >= 1
+
+        # Test schema access
         schema = result.schema
-        schema_index = schema.get_field_index("n_legs")
-        self.assertEqual(schema.field(schema_index).type, "int64")
-
-
-class TestS3FileToParquet(TestCase):
-    def test_s3_file_to_parquet_sanity(self):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.IDENTITY.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_presanitize_kwargs = cm.records[1].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
+        assert len(schema) == 3
+        assert "col1" in schema.names
+        assert "col2" in schema.names
+        assert "col3" in schema.names
+
+        # Test schema_arrow property
+        schema_arrow = result.schema_arrow
+        assert isinstance(schema_arrow, pa.Schema)
+        assert len(schema_arrow) == 3
+
+    def test_file_to_parquet_column_selection(self):
+        # Test reading specific columns
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        # Read only specific columns
+        table = result.read(columns=["col1", "col2"])
+        assert len(table.column_names) == 2
+        assert table.column_names == ["col1", "col2"]
+        assert len(table) == 3
+
+    def test_file_to_parquet_invalid_content_type(self):
+        # Test error handling for invalid content type
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_type=ContentType.CSV.value,  # Invalid content type
+                filesystem=self.fs,
             )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
 
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
-            **kwargs,
-        }
-        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-            result_parquet_file: ParquetFile = s3_file_to_parquet(
-                test_s3_url,
-                test_content_type,
-                test_content_encoding,
-                ["n_legs", "animal"],
-                ["n_legs"],
-                pa_read_func_kwargs_provider=pa_kwargs_provider,
-            )
-        log_message_log_args = cm.records[0].getMessage()
-        log_message_log_new_content_encoding = cm.records[1].getMessage()
-        log_message_presanitize_kwargs = cm.records[2].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
-        )
-        self.assertIn(
-            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
-            log_message_log_new_content_encoding,
-        )
-        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
-        for index, field in enumerate(result_parquet_file.schema_arrow):
-            self.assertEqual(
-                field.name, result_parquet_file.schema_arrow.field(index).name
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_invalid_content_encoding(self):
+        # Test error handling for invalid content encoding
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        with self.assertRaises(ContentTypeValidationError) as context:
+            file_to_parquet(
+                parquet_path,
+                content_encoding=ContentEncoding.GZIP.value,  # Invalid encoding
+                filesystem=self.fs,
            )
-        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
 
-    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
-        self,
-    ):
-        test_s3_url = PARQUET_FILE_PATH
-        test_content_type = ContentType.PARQUET.value
-        test_content_encoding = ContentEncoding.GZIP.value
-        pa_kwargs_provider = lambda content_type, kwargs: {
-            "reader_type": "pyarrow",
-            **kwargs,
-        }
-        with self.assertRaises(ContentTypeValidationError):
-            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
-                s3_file_to_parquet(
-                    test_s3_url,
-                    test_content_type,
-                    test_content_encoding,
-                    ["n_legs", "animal"],
-                    ["n_legs"],
-                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+        assert "cannot be read into pyarrow.parquet.ParquetFile" in str(
+            context.exception
+        )
+
+    def test_file_to_parquet_different_filesystems(self):
+        # Test with different filesystem implementations
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # Test with fsspec filesystem
+        result_fsspec = file_to_parquet(parquet_path, filesystem=self.fs)
+        assert isinstance(result_fsspec, papq.ParquetFile)
+        assert result_fsspec.metadata.num_rows == 3
+
+        # Test with None filesystem (inferred)
+        result_inferred = file_to_parquet(parquet_path, filesystem=None)
+        assert isinstance(result_inferred, papq.ParquetFile)
+        assert result_inferred.metadata.num_rows == 3
+
+    def test_file_to_parquet_lazy_loading(self):
+        # Test that ParquetFile provides lazy loading capabilities
+        large_parquet_path = f"{self.base_path}/test_large.parquet"
+
+        result = file_to_parquet(large_parquet_path, filesystem=self.fs)
+
+        # ParquetFile should be created without loading all data
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 1000
+
+        # Test reading only specific columns (lazy loading)
+        partial_table = result.read(columns=["col1", "col2"])
+        assert len(partial_table) == 1000  # All rows but only 2 columns
+        assert partial_table.column_names == ["col1", "col2"]
+
+        # Test reading specific row group (lazy loading)
+        row_group_table = result.read_row_group(0)
+        assert len(row_group_table) <= 100  # Based on row_group_size
+
+    def test_file_to_parquet_performance_timing(self):
+        # Test that performance timing is logged (basic functionality test)
+        parquet_path = f"{self.base_path}/test.parquet"
+
+        # This should complete without error and log timing
+        result = file_to_parquet(parquet_path, filesystem=self.fs)
+
+        assert isinstance(result, papq.ParquetFile)
+        assert result.metadata.num_rows == 3
+
+
+class TestFileToTableFilesystems(TestCase):
+    """Test file_to_table with different filesystem implementations across all content types."""
+
+    def setUp(self):
+        self.tmpdir = tempfile.mkdtemp()
+        self._create_test_files()
+
+    def tearDown(self):
+        import shutil
+
+        shutil.rmtree(self.tmpdir)
+
+    def _create_test_files(self):
+        """Create test files for all supported content types."""
+        # Test data
+        test_data = pa.table(
+            {
+                "id": [1, 2, 3, 4, 5],
+                "name": ["Alice", "Bob", "Charlie", "Diana", "Eve"],
+                "age": [25, 30, 35, 28, 32],
+                "score": [85.5, 92.0, 78.5, 88.0, 95.5],
+            }
+        )
+
+        # File paths
+        self.csv_file = f"{self.tmpdir}/test.csv"
+        self.tsv_file = f"{self.tmpdir}/test.tsv"
+        self.psv_file = f"{self.tmpdir}/test.psv"
+        self.unescaped_tsv_file = f"{self.tmpdir}/test_unescaped.tsv"
+        self.parquet_file = f"{self.tmpdir}/test.parquet"
+        self.feather_file = f"{self.tmpdir}/test.feather"
+        self.json_file = f"{self.tmpdir}/test.json"
+        self.orc_file = f"{self.tmpdir}/test.orc"
+        self.avro_file = f"{self.tmpdir}/test.avro"
+
+        # Create CSV file
+        pacsv.write_csv(
+            test_data,
+            self.csv_file,
+            write_options=pacsv.WriteOptions(delimiter=",", include_header=False),
+        )
+
+        # Create TSV file
+        pacsv.write_csv(
+            test_data,
+            self.tsv_file,
+            write_options=pacsv.WriteOptions(delimiter="\t", include_header=False),
+        )
+
+        # Create PSV file
+        pacsv.write_csv(
+            test_data,
+            self.psv_file,
+            write_options=pacsv.WriteOptions(delimiter="|", include_header=False),
+        )
+
+        # Create unescaped TSV file
+        pacsv.write_csv(
+            test_data,
+            self.unescaped_tsv_file,
+            write_options=pacsv.WriteOptions(
+                delimiter="\t", include_header=False, quoting_style="none"
+            ),
+        )
+
+        # Create Parquet file
+        papq.write_table(test_data, self.parquet_file)
+
+        # Create Feather file
+        paf.write_feather(test_data, self.feather_file)
+
+        # Create JSON file (write as JSONL format)
+        df = test_data.to_pandas()
+        with open(self.json_file, "w") as f:
+            for _, row in df.iterrows():
+                json.dump(row.to_dict(), f)
+                f.write("\n")
+
+        # Create ORC file
+        paorc.write_table(test_data, self.orc_file)
+
+        # Create Avro file
+        try:
+            import polars as pl
+
+            pl_df = pl.from_arrow(test_data)
+            pl_df.write_avro(self.avro_file)
+        except ImportError:
+            # Skip Avro file creation if polars is not available
+            self.avro_file = None
+
+    def _get_filesystems(self, file_path):
+        """Get different filesystem implementations for testing."""
+        # fsspec AbstractFileSystem
+        fsspec_fs = fsspec.filesystem("file")
+
+        # PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+
+        # None for automatic inference
+        auto_infer_fs = None
+
+        return [
+            ("fsspec", fsspec_fs),
+            ("pyarrow", pyarrow_fs),
+            ("auto_infer", auto_infer_fs),
+        ]
+
+    def _assert_table_content(self, table, content_type):
+        """Assert that the loaded table has expected content."""
+        self.assertEqual(len(table), 5, f"Expected 5 rows for {content_type}")
+        self.assertEqual(
+            len(table.columns), 4, f"Expected 4 columns for {content_type}"
+        )
+
+        # Check column names exist (order might vary for some formats)
+        column_names = set(table.column_names)
+        expected_columns = {"id", "name", "age", "score"}
+        self.assertEqual(
+            column_names, expected_columns, f"Column names mismatch for {content_type}"
+        )
+
+    def test_csv_all_filesystems(self):
+        """Test CSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"CSV with {fs_name}")
+
+    def test_tsv_all_filesystems(self):
+        """Test TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.tsv_file,
+                    ContentType.TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"TSV with {fs_name}")
+
+    def test_psv_all_filesystems(self):
+        """Test PSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.psv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.psv_file,
+                    ContentType.PSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"PSV with {fs_name}")
+
+    def test_unescaped_tsv_all_filesystems(self):
+        """Test unescaped TSV reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.unescaped_tsv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.unescaped_tsv_file,
+                    ContentType.UNESCAPED_TSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                )
+                self._assert_table_content(table, f"UNESCAPED_TSV with {fs_name}")
+
+    def test_parquet_all_filesystems(self):
+        """Test Parquet reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"PARQUET with {fs_name}")
+
+    def test_feather_all_filesystems(self):
+        """Test Feather reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.feather_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.feather_file,
+                    ContentType.FEATHER.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"FEATHER with {fs_name}")
+
+    def test_json_all_filesystems(self):
+        """Test JSON reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.json_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.json_file,
+                    ContentType.JSON.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"JSON with {fs_name}")
+
+    def test_orc_all_filesystems(self):
+        """Test ORC reading with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.orc_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.orc_file,
+                    ContentType.ORC.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"ORC with {fs_name}")
+
+    def test_avro_all_filesystems(self):
+        """Test Avro reading with all filesystem types."""
+        if self.avro_file is None:
+            self.skipTest("Avro file creation skipped (polars not available)")
+
+        for fs_name, filesystem in self._get_filesystems(self.avro_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.avro_file,
+                    ContentType.AVRO.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                )
+                self._assert_table_content(table, f"AVRO with {fs_name}")
+
+    def test_column_selection_all_filesystems(self):
+        """Test column selection works with all filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.parquet_file,
+                    ContentType.PARQUET.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    include_columns=["name", "age"],
+                )
+                self.assertEqual(
+                    len(table.columns), 2, f"Expected 2 columns with {fs_name}"
                 )
-        log_message_log_args = cm.records[0].getMessage()
-        self.assertIn(
-            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
-            log_message_log_args,
+                self.assertEqual(
+                    set(table.column_names),
+                    {"name", "age"},
+                    f"Column selection failed with {fs_name}",
+                )
+
+    def test_kwargs_provider_all_filesystems(self):
+        """Test that kwargs providers work with all filesystem types."""
+
+        def schema_provider(content_type, kwargs):
+            if content_type == ContentType.CSV.value:
+                # Force all columns to be strings
+                kwargs["convert_options"] = pacsv.ConvertOptions(
+                    column_types={
+                        "id": pa.string(),
+                        "name": pa.string(),
+                        "age": pa.string(),
+                        "score": pa.string(),
+                    }
+                )
+            return kwargs
+
+        for fs_name, filesystem in self._get_filesystems(self.csv_file):
+            with self.subTest(filesystem=fs_name):
+                table = file_to_table(
+                    self.csv_file,
+                    ContentType.CSV.value,
+                    ContentEncoding.IDENTITY.value,
+                    filesystem=filesystem,
+                    column_names=["id", "name", "age", "score"],
+                    pa_read_func_kwargs_provider=schema_provider,
+                )
+                # Check that all columns are strings
+                for field in table.schema:
+                    self.assertEqual(
+                        field.type,
+                        pa.string(),
+                        f"Column {field.name} should be string with {fs_name}",
+                    )
+
+    def test_filesystem_auto_inference_consistency(self):
+        """Test that auto-inferred filesystem produces same results as explicit filesystems."""
+        # Use Parquet as it's most reliable across filesystem types
+
+        # Read with auto-inference
+        auto_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=None,  # Auto-infer
+        )
+
+        # Read with explicit fsspec filesystem
+        fsspec_fs = fsspec.filesystem("file")
+        fsspec_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=fsspec_fs,
         )
+
+        # Read with explicit PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        pyarrow_fs = pafs.LocalFileSystem()
+        pyarrow_table = file_to_table(
+            self.parquet_file,
+            ContentType.PARQUET.value,
+            ContentEncoding.IDENTITY.value,
+            filesystem=pyarrow_fs,
+        )
+
+        # All should produce equivalent results
+        self.assertTrue(
+            auto_table.equals(fsspec_table),
+            "Auto-inferred result should match fsspec result",
+        )
+        self.assertTrue(
+            auto_table.equals(pyarrow_table),
+            "Auto-inferred result should match PyArrow result",
+        )
+
+    def test_error_handling_all_filesystems(self):
+        """Test error handling works consistently across filesystem types."""
+        for fs_name, filesystem in self._get_filesystems(self.parquet_file):
+            with self.subTest(filesystem=fs_name):
+                # Test unsupported content type
+                with self.assertRaises(NotImplementedError):
+                    file_to_table(
+                        self.parquet_file,
+                        "UNSUPPORTED_TYPE",
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
+
+                # Test non-existent file
+                with self.assertRaises((FileNotFoundError, OSError)):
+                    file_to_table(
+                        f"{self.tmpdir}/non_existent.parquet",
+                        ContentType.PARQUET.value,
+                        ContentEncoding.IDENTITY.value,
+                        filesystem=filesystem,
+                    )
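
The added tests above exercise the new path-based readers (file_to_table and file_to_parquet) against an explicit fsspec filesystem, an explicit PyArrow filesystem, and automatic filesystem inference. A minimal usage sketch of that calling pattern outside the test harness follows; the import paths are assumptions (this hunk does not show the module's import block), and only the call signatures visible in the tests above are used.

# Hedged sketch: the import locations below are assumed, not confirmed by this hunk.
import fsspec
import pyarrow.parquet as papq
from deltacat.types.media import ContentEncoding, ContentType  # assumed module path
from deltacat.utils.pyarrow import file_to_parquet, file_to_table  # assumed module path

# Read a local Parquet file into a pyarrow.Table, letting the reader infer the filesystem.
table = file_to_table(
    "/tmp/example.parquet",
    ContentType.PARQUET.value,
    ContentEncoding.IDENTITY.value,
    filesystem=None,  # None triggers filesystem inference, as in the tests above
)

# Open the same file lazily as a pyarrow.parquet.ParquetFile via an explicit fsspec filesystem.
fs = fsspec.filesystem("file")
pq_file = file_to_parquet("/tmp/example.parquet", filesystem=fs)
assert isinstance(pq_file, papq.ParquetFile)
assert pq_file.metadata.num_rows == len(table)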