deltacat 1.1.11__py3-none-any.whl → 1.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.11"
47
+ __version__ = "1.1.12"
48
48
 
49
49
 
50
50
  __all__ = [
@@ -104,6 +104,44 @@ class TestDaftS3FileToTable(unittest.TestCase):
104
104
  self.assertEqual(table.schema.field("MISSING").type, pa.string())
105
105
  self.assertEqual(table.num_rows, 100)
106
106
 
107
+ def test_read_from_s3_single_column_with_schema_extra_cols_column_names(self):
108
+ schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
109
+ pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
110
+ schema=schema
111
+ )
112
+ table = daft_s3_file_to_table(
113
+ self.MVP_PATH,
114
+ content_encoding=ContentEncoding.IDENTITY.value,
115
+ content_type=ContentType.PARQUET.value,
116
+ column_names=["a", "MISSING"],
117
+ pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
118
+ )
119
+ self.assertEqual(
120
+ table.schema.names, ["a", "MISSING"]
121
+ ) # NOTE: "MISSING" is padded as a null array
122
+ self.assertEqual(table.schema.field("a").type, pa.int8())
123
+ self.assertEqual(table.schema.field("MISSING").type, pa.string())
124
+ self.assertEqual(table.num_rows, 100)
125
+
126
+ def test_read_from_s3_single_column_with_schema_only_missing_col(self):
127
+ schema = pa.schema([("a", pa.int8()), ("MISSING", pa.string())])
128
+ pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
129
+ schema=schema
130
+ )
131
+ table = daft_s3_file_to_table(
132
+ self.MVP_PATH,
133
+ content_encoding=ContentEncoding.IDENTITY.value,
134
+ content_type=ContentType.PARQUET.value,
135
+ include_columns=["MISSING"],
136
+ column_names=["a", "MISSING"],
137
+ pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
138
+ )
139
+ self.assertEqual(
140
+ table.schema.names, ["MISSING"]
141
+ ) # NOTE: "MISSING" is padded as a null array
142
+ self.assertEqual(table.schema.field("MISSING").type, pa.string())
143
+ self.assertEqual(table.num_rows, 0)
144
+
107
145
  def test_read_from_s3_single_column_with_row_groups(self):
108
146
 
109
147
  metadata = pq.read_metadata(self.MVP_PATH)
@@ -85,6 +85,43 @@ class TestS3PartialParquetFileToTable(TestCase):
85
85
  self.assertEqual(result_schema.field(2).type, "int64")
86
86
  self.assertEqual(result_schema.field(2).name, "MISSING")
87
87
 
88
+ def test_s3_partial_parquet_file_to_table_when_schema_missing_columns(self):
89
+
90
+ pq_file = ParquetFile(PARQUET_FILE_PATH)
91
+ partial_parquet_params = PartialParquetParameters.of(
92
+ pq_metadata=pq_file.metadata
93
+ )
94
+ # only first row group to be downloaded
95
+ partial_parquet_params.row_groups_to_download.pop()
96
+
97
+ schema = pa.schema(
98
+ [
99
+ pa.field("n_legs", pa.string()),
100
+ pa.field("animal", pa.string()),
101
+ # NOTE: This field is not in the parquet file, but will be added on as an all-null column
102
+ pa.field("MISSING", pa.int64()),
103
+ ]
104
+ )
105
+
106
+ pa_kwargs_provider = lambda content_type, kwargs: {"schema": schema}
107
+
108
+ result = s3_partial_parquet_file_to_table(
109
+ PARQUET_FILE_PATH,
110
+ ContentType.PARQUET.value,
111
+ ContentEncoding.IDENTITY.value,
112
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
113
+ partial_file_download_params=partial_parquet_params,
114
+ column_names=["n_legs", "animal", "MISSING"],
115
+ include_columns=["MISSING"],
116
+ )
117
+
118
+ self.assertEqual(len(result), 0)
119
+ self.assertEqual(len(result.column_names), 1)
120
+
121
+ result_schema = result.schema
122
+ self.assertEqual(result_schema.field(0).type, "int64")
123
+ self.assertEqual(result_schema.field(0).name, "MISSING")
124
+
88
125
  def test_s3_partial_parquet_file_to_table_when_schema_passed_with_include_columns(
89
126
  self,
90
127
  ):
@@ -234,6 +271,32 @@ class TestReadCSV(TestCase):
234
271
  lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
235
272
  )
236
273
 
274
+ def test_read_csv_when_excess_columns_included(self):
275
+
276
+ schema = pa.schema(
277
+ [
278
+ ("is_active", pa.string()),
279
+ ("ship_datetime_utc", pa.timestamp("us")),
280
+ ("MISSING", pa.string()),
281
+ ]
282
+ )
283
+ kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
284
+ _add_column_kwargs(
285
+ ContentType.UNESCAPED_TSV.value,
286
+ ["is_active", "ship_datetime_utc", "MISSING"],
287
+ ["is_active", "ship_datetime_utc", "MISSING"],
288
+ kwargs,
289
+ )
290
+
291
+ read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
292
+
293
+ kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
294
+
295
+ self.assertRaises(
296
+ pa.lib.ArrowInvalid,
297
+ lambda: pyarrow_read_csv(NON_EMPTY_VALID_UTSV_PATH, **kwargs),
298
+ )
299
+
237
300
  def test_read_csv_when_empty_csv_sanity(self):
238
301
 
239
302
  schema = pa.schema(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.11
3
+ Version: 1.1.12
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
@@ -21,13 +21,13 @@ Requires-Dist: numpy ==1.21.5
21
21
  Requires-Dist: pandas ==1.3.5
22
22
  Requires-Dist: pyarrow ==12.0.1
23
23
  Requires-Dist: pydantic ==1.10.4
24
- Requires-Dist: ray[default] >=2.20.0
24
+ Requires-Dist: ray >=2.20.0
25
25
  Requires-Dist: s3fs ==2024.5.0
26
26
  Requires-Dist: tenacity ==8.1.0
27
27
  Requires-Dist: typing-extensions ==4.4.0
28
28
  Requires-Dist: pymemcache ==4.0.0
29
29
  Requires-Dist: redis ==4.6.0
30
- Requires-Dist: getdaft ==0.2.29
30
+ Requires-Dist: getdaft ==0.2.31
31
31
  Requires-Dist: schedule ==1.2.0
32
32
 
33
33
  # DeltaCAT
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=z6zdRpLnxm9AOnjLtJBmkciFK82KnMkPjmtjy85AttM,1778
1
+ deltacat/__init__.py,sha256=jtrgQAzzZxq5YU_9RwtH0N4p8k4_ACd_vcjuWJf934Q,1778
2
2
  deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
3
3
  deltacat/exceptions.py,sha256=yWM4RXK7uRrQc1VgJv6Lv2UiNZWAx2wolLq7cBwjlkg,12770
4
4
  deltacat/logs.py,sha256=6g16VkEFidbaMjgenAjggE1r2l664drMVhreRs8B1IQ,8438
@@ -182,10 +182,10 @@ deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfW
182
182
  deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
183
183
  deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
184
184
  deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iyzjiJ7-Rcl0o,1506
185
- deltacat/tests/utils/test_daft.py,sha256=AIE0qz6oKhEEvBqF0VfQ5pwTtiTHqyf0EuUXduiS3t4,6487
185
+ deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
186
186
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
187
187
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
188
- deltacat/tests/utils/test_pyarrow.py,sha256=eZAuYp9MUf8lmpIilH57JkURuNsTGZ3IAGC4Gm5hdrM,17307
188
+ deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
189
189
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
190
190
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
191
191
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -216,8 +216,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
216
216
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
217
217
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
218
218
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
219
- deltacat-1.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
220
- deltacat-1.1.11.dist-info/METADATA,sha256=60wuPvw4-9iEcp9v1Bz0b4fpEwSGfgjYQ_0YGMaVuVo,1757
221
- deltacat-1.1.11.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
222
- deltacat-1.1.11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
223
- deltacat-1.1.11.dist-info/RECORD,,
219
+ deltacat-1.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
220
+ deltacat-1.1.12.dist-info/METADATA,sha256=fqP5NYc_4wrfWcsQXO1Aht8xM22LrmDZLHfMAqq2opQ,1748
221
+ deltacat-1.1.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
222
+ deltacat-1.1.12.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
223
+ deltacat-1.1.12.dist-info/RECORD,,